"""howard.objects.variants"""
1import csv 2import gc 3import gzip 4import io 5import multiprocessing 6import os 7import random 8import re 9import shlex 10import sqlite3 11import subprocess 12from tempfile import NamedTemporaryFile, TemporaryDirectory 13import tempfile 14import duckdb 15import json 16import yaml 17import argparse 18import Bio.bgzf as bgzf 19import pandas as pd 20from pyfaidx import Fasta 21import numpy as np 22import vcf 23import logging as log 24import fastparquet as fp 25from multiprocesspandas import applyparallel 26import cyvcf2 27import pyBigWig 28 29from howard.functions.commons import * 30from howard.objects.database import * 31from howard.functions.databases import * 32from howard.functions.utils import * 33 34 35class Variants: 36 37 def __init__( 38 self, 39 conn=None, 40 input: str = None, 41 output: str = None, 42 config: dict = {}, 43 param: dict = {}, 44 load: bool = False, 45 ) -> None: 46 """ 47 The function `__init__` initializes the variables, sets the input, output, config, param, connexion and 48 header 49 50 :param conn: the connection to the database 51 :param input: the input file 52 :param output: the output file 53 :param config: a dictionary containing the configuration of the model 54 :param param: a dictionary containing the parameters of the model 55 """ 56 57 # Init variables 58 self.init_variables() 59 60 # Input 61 self.set_input(input) 62 63 # Config 64 self.set_config(config) 65 66 # Param 67 self.set_param(param) 68 69 # Output 70 self.set_output(output) 71 72 # connexion 73 self.set_connexion(conn) 74 75 # Header 76 self.set_header() 77 78 # Samples 79 self.set_samples() 80 81 # Load data 82 if load: 83 self.load_data() 84 85 def set_samples(self, samples: list = None) -> list: 86 """ 87 The function `set_samples` sets the samples attribute of an object to a provided list or 88 retrieves it from a parameter dictionary. 
89 90 :param samples: The `set_samples` method is a method of a class that takes a list of samples as 91 input and sets the `samples` attribute of the class to the provided list. If no samples are 92 provided, it tries to get the samples from the class's parameters using the `get_param` method 93 :type samples: list 94 :return: The `samples` list is being returned. 95 """ 96 97 if not samples: 98 samples = self.get_param().get("samples", {}).get("list", None) 99 100 self.samples = samples 101 102 return samples 103 104 def get_samples(self) -> list: 105 """ 106 This function returns a list of samples. 107 :return: The `get_samples` method is returning the `samples` attribute of the object. 108 """ 109 110 return self.samples 111 112 def get_samples_check(self) -> bool: 113 """ 114 This function returns the value of the "check" key within the "samples" dictionary retrieved 115 from the parameters. 116 :return: The method `get_samples_check` is returning the value of the key "check" inside the 117 "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` 118 method. If the key "check" is not found, it will return `False`. 119 """ 120 121 return self.get_param().get("samples", {}).get("check", True) 122 123 def set_input(self, input: str = None) -> None: 124 """ 125 The function `set_input` takes a file name as input, extracts the name and extension, and sets 126 attributes in the class accordingly. 127 128 :param input: The `set_input` method in the provided code snippet is used to set attributes 129 related to the input file. 
Here's a breakdown of the parameters and their usage in the method: 130 :type input: str 131 """ 132 133 if input and not isinstance(input, str): 134 try: 135 self.input = input.name 136 except: 137 log.error(f"Input file '{input} in bad format") 138 raise ValueError(f"Input file '{input} in bad format") 139 else: 140 self.input = input 141 142 # Input format 143 if input: 144 input_name, input_extension = os.path.splitext(self.input) 145 self.input_name = input_name 146 self.input_extension = input_extension 147 self.input_format = self.input_extension.replace(".", "") 148 149 def set_config(self, config: dict) -> None: 150 """ 151 The set_config function takes a config object and assigns it as the configuration object for the 152 class. 153 154 :param config: The `config` parameter in the `set_config` function is a dictionary object that 155 contains configuration settings for the class. When you call the `set_config` function with a 156 dictionary object as the argument, it will set that dictionary as the configuration object for 157 the class 158 :type config: dict 159 """ 160 161 self.config = config 162 163 def set_param(self, param: dict) -> None: 164 """ 165 This function sets a parameter object for the class based on the input dictionary. 
166 167 :param param: The `set_param` method you provided takes a dictionary object as input and sets it 168 as the `param` attribute of the class instance 169 :type param: dict 170 """ 171 172 self.param = param 173 174 def init_variables(self) -> None: 175 """ 176 This function initializes the variables that will be used in the rest of the class 177 """ 178 179 self.prefix = "howard" 180 self.table_variants = "variants" 181 self.dataframe = None 182 183 self.comparison_map = { 184 "gt": ">", 185 "gte": ">=", 186 "lt": "<", 187 "lte": "<=", 188 "equals": "=", 189 "contains": "SIMILAR TO", 190 } 191 192 self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3} 193 194 self.code_type_map_to_sql = { 195 "Integer": "INTEGER", 196 "String": "VARCHAR", 197 "Float": "FLOAT", 198 "Flag": "VARCHAR", 199 } 200 201 self.index_additionnal_fields = [] 202 203 def get_indexing(self) -> bool: 204 """ 205 It returns the value of the key "indexing" in the dictionary. If the key is not present, it 206 returns False. 207 :return: The value of the indexing parameter. 208 """ 209 210 return self.get_param().get("indexing", False) 211 212 def get_connexion_config(self) -> dict: 213 """ 214 The function `get_connexion_config` returns a dictionary containing the configuration for a 215 connection, including the number of threads and memory limit. 216 :return: a dictionary containing the configuration for the Connexion library. 
217 """ 218 219 # config 220 config = self.get_config() 221 222 # Connexion config 223 connexion_config = {} 224 threads = self.get_threads() 225 226 # Threads 227 if threads: 228 connexion_config["threads"] = threads 229 230 # Memory 231 # if config.get("memory", None): 232 # connexion_config["memory_limit"] = config.get("memory") 233 if self.get_memory(): 234 connexion_config["memory_limit"] = self.get_memory() 235 236 # Temporary directory 237 if config.get("tmp", None): 238 connexion_config["temp_directory"] = config.get("tmp") 239 240 # Access 241 if config.get("access", None): 242 access = config.get("access") 243 if access in ["RO"]: 244 access = "READ_ONLY" 245 elif access in ["RW"]: 246 access = "READ_WRITE" 247 connexion_db = self.get_connexion_db() 248 if connexion_db in ":memory:": 249 access = "READ_WRITE" 250 connexion_config["access_mode"] = access 251 252 return connexion_config 253 254 def get_duckdb_settings(self) -> dict: 255 """ 256 The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a 257 string. 258 :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`. 259 """ 260 261 # config 262 config = self.get_config() 263 264 # duckdb settings 265 duckdb_settings_dict = {} 266 if config.get("duckdb_settings", None): 267 duckdb_settings = config.get("duckdb_settings") 268 duckdb_settings = full_path(duckdb_settings) 269 # duckdb setting is a file 270 if os.path.exists(duckdb_settings): 271 with open(duckdb_settings) as json_file: 272 duckdb_settings_dict = yaml.safe_load(json_file) 273 # duckdb settings is a string 274 else: 275 duckdb_settings_dict = json.loads(duckdb_settings) 276 277 return duckdb_settings_dict 278 279 def set_connexion_db(self) -> str: 280 """ 281 The function `set_connexion_db` returns the appropriate database connection string based on the 282 input format and connection type. 283 :return: the value of the variable `connexion_db`. 
284 """ 285 286 # Default connexion db 287 default_connexion_db = ":memory:" 288 289 # Find connexion db 290 if self.get_input_format() in ["db", "duckdb"]: 291 connexion_db = self.get_input() 292 elif self.get_connexion_type() in ["memory", default_connexion_db, None]: 293 connexion_db = default_connexion_db 294 elif self.get_connexion_type() in ["tmpfile"]: 295 tmp_name = tempfile.mkdtemp( 296 prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db" 297 ) 298 connexion_db = f"{tmp_name}/tmp.db" 299 elif self.get_connexion_type() != "": 300 connexion_db = self.get_connexion_type() 301 else: 302 connexion_db = default_connexion_db 303 304 # Set connexion db 305 self.connexion_db = connexion_db 306 307 return connexion_db 308 309 def set_connexion(self, conn) -> None: 310 """ 311 The function `set_connexion` creates a connection to a database, with options for different 312 database formats and settings. 313 314 :param conn: The `conn` parameter in the `set_connexion` method is the connection to the 315 database. If a connection is not provided, a new connection to an in-memory database is created. 
316 The method then proceeds to set up the connection based on the specified format (e.g., duckdb or 317 sqlite 318 """ 319 320 # Connexion db 321 connexion_db = self.set_connexion_db() 322 323 # Connexion config 324 connexion_config = self.get_connexion_config() 325 326 # Connexion format 327 connexion_format = self.get_config().get("connexion_format", "duckdb") 328 # Set connexion format 329 self.connexion_format = connexion_format 330 331 # Connexion 332 if not conn: 333 if connexion_format in ["duckdb"]: 334 conn = duckdb.connect(connexion_db, config=connexion_config) 335 # duckDB settings 336 duckdb_settings = self.get_duckdb_settings() 337 if duckdb_settings: 338 for setting in duckdb_settings: 339 setting_value = duckdb_settings.get(setting) 340 if isinstance(setting_value, str): 341 setting_value = f"'{setting_value}'" 342 conn.execute(f"PRAGMA {setting}={setting_value};") 343 elif connexion_format in ["sqlite"]: 344 conn = sqlite3.connect(connexion_db) 345 346 # Set connexion 347 self.conn = conn 348 349 # Log 350 log.debug(f"connexion_format: {connexion_format}") 351 log.debug(f"connexion_db: {connexion_db}") 352 log.debug(f"connexion config: {connexion_config}") 353 log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}") 354 355 def set_output(self, output: str = None) -> None: 356 """ 357 The `set_output` function in Python sets the output file based on the input or a specified key 358 in the config file, extracting the output name, extension, and format. 359 360 :param output: The `output` parameter in the `set_output` method is used to specify the name of 361 the output file. If the config file has an 'output' key, the method sets the output to the value 362 of that key. 
If no output is provided, it sets the output to `None` 363 :type output: str 364 """ 365 366 if output and not isinstance(output, str): 367 self.output = output.name 368 else: 369 self.output = output 370 371 # Output format 372 if self.output: 373 output_name, output_extension = os.path.splitext(self.output) 374 self.output_name = output_name 375 self.output_extension = output_extension 376 self.output_format = self.output_extension.replace(".", "") 377 else: 378 self.output_name = None 379 self.output_extension = None 380 self.output_format = None 381 382 def set_header(self) -> None: 383 """ 384 It reads the header of a VCF file and stores it as a list of strings and as a VCF object 385 """ 386 387 input_file = self.get_input() 388 default_header_list = [ 389 "##fileformat=VCFv4.2", 390 "#CHROM POS ID REF ALT QUAL FILTER INFO", 391 ] 392 393 # Full path 394 input_file = full_path(input_file) 395 396 if input_file: 397 398 input_format = self.get_input_format() 399 input_compressed = self.get_input_compressed() 400 config = self.get_config() 401 header_list = default_header_list 402 if input_format in [ 403 "vcf", 404 "hdr", 405 "tsv", 406 "csv", 407 "psv", 408 "parquet", 409 "db", 410 "duckdb", 411 ]: 412 # header provided in param 413 if config.get("header_file", None): 414 with open(config.get("header_file"), "rt") as f: 415 header_list = self.read_vcf_header(f) 416 # within a vcf file format (header within input file itsself) 417 elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file): 418 # within a compressed vcf file format (.vcf.gz) 419 if input_compressed: 420 with bgzf.open(input_file, "rt") as f: 421 header_list = self.read_vcf_header(f) 422 # within an uncompressed vcf file format (.vcf) 423 else: 424 with open(input_file, "rt") as f: 425 header_list = self.read_vcf_header(f) 426 # header provided in default external file .hdr 427 elif os.path.exists((input_file + ".hdr")): 428 with open(input_file + ".hdr", "rt") as f: 429 header_list = 
self.read_vcf_header(f) 430 else: 431 try: # Try to get header info fields and file columns 432 433 with tempfile.TemporaryDirectory() as tmpdir: 434 435 # Create database 436 db_for_header = Database(database=input_file) 437 438 # Get header columns for infos fields 439 db_header_from_columns = ( 440 db_for_header.get_header_from_columns() 441 ) 442 443 # Get real columns in the file 444 db_header_columns = db_for_header.get_columns() 445 446 # Write header file 447 header_file_tmp = os.path.join(tmpdir, "header") 448 f = open(header_file_tmp, "w") 449 vcf.Writer(f, db_header_from_columns) 450 f.close() 451 452 # Replace #CHROM line with rel columns 453 header_list = db_for_header.read_header_file( 454 header_file=header_file_tmp 455 ) 456 header_list[-1] = "\t".join(db_header_columns) 457 458 except: 459 460 log.warning( 461 f"No header for file {input_file}. Set as default VCF header" 462 ) 463 header_list = default_header_list 464 465 else: # try for unknown format ? 466 467 log.error(f"Input file format '{input_format}' not available") 468 raise ValueError(f"Input file format '{input_format}' not available") 469 470 if not header_list: 471 header_list = default_header_list 472 473 # header as list 474 self.header_list = header_list 475 476 # header as VCF object 477 self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list))) 478 479 else: 480 481 self.header_list = None 482 self.header_vcf = None 483 484 def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame: 485 """ 486 The `get_query_to_df` function takes a query as a string and returns the result as a pandas 487 DataFrame based on the connection format. 488 489 :param query: The `query` parameter in the `get_query_to_df` function is a string that 490 represents the SQL query you want to execute. 
This query will be used to fetch data from a 491 database and convert it into a pandas DataFrame 492 :type query: str 493 :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the 494 maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the 495 function will only fetch up to that number of rows from the database query result. If no limit 496 is specified, 497 :type limit: int 498 :return: A pandas DataFrame is being returned by the `get_query_to_df` function. 499 """ 500 501 # Connexion format 502 connexion_format = self.get_connexion_format() 503 504 # Limit in query 505 if limit: 506 pd.set_option("display.max_rows", limit) 507 if connexion_format in ["duckdb"]: 508 df = ( 509 self.conn.execute(query) 510 .fetch_record_batch(limit) 511 .read_next_batch() 512 .to_pandas() 513 ) 514 elif connexion_format in ["sqlite"]: 515 df = next(pd.read_sql_query(query, self.conn, chunksize=limit)) 516 517 # Full query 518 else: 519 if connexion_format in ["duckdb"]: 520 df = self.conn.execute(query).df() 521 elif connexion_format in ["sqlite"]: 522 df = pd.read_sql_query(query, self.conn) 523 524 return df 525 526 def get_overview(self) -> None: 527 """ 528 The function prints the input, output, config, and dataframe of the current object 529 """ 530 table_variants_from = self.get_table_variants(clause="from") 531 sql_columns = self.get_header_columns_as_sql() 532 sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}" 533 df = self.get_query_to_df(sql_query_export) 534 log.info( 535 "Input: " 536 + str(self.get_input()) 537 + " [" 538 + str(str(self.get_input_format())) 539 + "]" 540 ) 541 log.info( 542 "Output: " 543 + str(self.get_output()) 544 + " [" 545 + str(str(self.get_output_format())) 546 + "]" 547 ) 548 log.info("Config: ") 549 for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split( 550 "\n" 551 ): 552 log.info("\t" + str(d)) 553 log.info("Param: ") 554 for d 
in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split( 555 "\n" 556 ): 557 log.info("\t" + str(d)) 558 log.info("Sample list: " + str(self.get_header_sample_list())) 559 log.info("Dataframe: ") 560 for d in str(df).split("\n"): 561 log.info("\t" + str(d)) 562 563 # garbage collector 564 del df 565 gc.collect() 566 567 return None 568 569 def get_stats(self) -> dict: 570 """ 571 The `get_stats` function calculates and returns various statistics of the current object, 572 including information about the input file, variants, samples, header fields, quality, and 573 SNVs/InDels. 574 :return: a dictionary containing various statistics of the current object. The dictionary has 575 the following structure: 576 """ 577 578 # Log 579 log.info(f"Stats Calculation...") 580 581 # table varaints 582 table_variants_from = self.get_table_variants() 583 584 # stats dict 585 stats = {"Infos": {}} 586 587 ### File 588 input_file = self.get_input() 589 stats["Infos"]["Input file"] = input_file 590 591 # Header 592 header_infos = self.get_header().infos 593 header_formats = self.get_header().formats 594 header_infos_list = list(header_infos) 595 header_formats_list = list(header_formats) 596 597 ### Variants 598 599 stats["Variants"] = {} 600 601 # Variants by chr 602 sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"' 603 df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom) 604 nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values( 605 by=["CHROM"], kind="quicksort" 606 ) 607 608 # Total number of variants 609 nb_of_variants = nb_of_variants_by_chrom["count"].sum() 610 611 # Calculate percentage 612 nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply( 613 lambda x: (x / nb_of_variants) 614 ) 615 616 stats["Variants"]["Number of variants by chromosome"] = ( 617 nb_of_variants_by_chrom.to_dict(orient="index") 618 ) 619 620 
stats["Infos"]["Number of variants"] = int(nb_of_variants) 621 622 ### Samples 623 624 # Init 625 samples = {} 626 nb_of_samples = 0 627 628 # Check Samples 629 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 630 log.debug(f"Check samples...") 631 for sample in self.get_header_sample_list(): 632 sql_query_samples = f""" 633 SELECT '{sample}' as sample, 634 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 635 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 636 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 637 FROM {table_variants_from} 638 WHERE ( 639 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 640 AND 641 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 642 ) 643 GROUP BY genotype 644 """ 645 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 646 sample_genotype_count = sql_query_genotype_df["count"].sum() 647 if len(sql_query_genotype_df): 648 nb_of_samples += 1 649 samples[f"{sample} - {sample_genotype_count} variants"] = ( 650 sql_query_genotype_df.to_dict(orient="index") 651 ) 652 653 stats["Samples"] = samples 654 stats["Infos"]["Number of samples"] = nb_of_samples 655 656 # # 657 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 658 # stats["Infos"]["Number of samples"] = nb_of_samples 659 # elif nb_of_samples: 660 # stats["Infos"]["Number of samples"] = "not a VCF format" 661 662 ### INFO and FORMAT fields 663 header_types_df = {} 664 header_types_list = { 665 "List of INFO fields": header_infos, 666 "List of FORMAT fields": header_formats, 667 } 668 i = 0 669 for header_type in header_types_list: 670 671 header_type_infos = header_types_list.get(header_type) 672 header_infos_dict = {} 673 674 for info in header_type_infos: 675 676 i += 1 677 header_infos_dict[i] = {} 678 679 # ID 680 header_infos_dict[i]["id"] = info 681 682 # num 683 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 684 if header_type_infos[info].num in genotype_map.keys(): 685 header_infos_dict[i]["Number"] = genotype_map.get( 686 header_type_infos[info].num 687 ) 688 else: 689 header_infos_dict[i]["Number"] = header_type_infos[info].num 690 691 # type 692 if header_type_infos[info].type: 693 header_infos_dict[i]["Type"] = header_type_infos[info].type 694 else: 695 header_infos_dict[i]["Type"] = "." 696 697 # desc 698 if header_type_infos[info].desc != None: 699 header_infos_dict[i]["Description"] = header_type_infos[info].desc 700 else: 701 header_infos_dict[i]["Description"] = "" 702 703 if len(header_infos_dict): 704 header_types_df[header_type] = pd.DataFrame.from_dict( 705 header_infos_dict, orient="index" 706 ).to_dict(orient="index") 707 708 # Stats 709 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 710 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 711 stats["Header"] = header_types_df 712 713 ### QUAL 714 if "QUAL" in self.get_header_columns(): 715 sql_query_qual = f""" 716 SELECT 717 avg(CAST(QUAL AS INTEGER)) AS Average, 718 min(CAST(QUAL AS INTEGER)) AS Minimum, 719 max(CAST(QUAL AS INTEGER)) AS Maximum, 720 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 721 median(CAST(QUAL AS INTEGER)) AS Median, 722 variance(CAST(QUAL AS INTEGER)) AS Variance 723 FROM {table_variants_from} 724 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 725 """ 726 727 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 728 stats["Quality"] = {"Stats": qual} 729 730 ### SNV and InDel 731 732 sql_query_snv = f""" 733 734 SELECT Type, count FROM ( 735 736 SELECT 737 'Total' AS Type, 738 count(*) AS count 739 FROM {table_variants_from} 740 741 UNION 742 743 SELECT 744 'MNV' AS Type, 745 count(*) AS count 746 FROM {table_variants_from} 747 WHERE len(REF) > 1 AND len(ALT) > 1 748 AND len(REF) = len(ALT) 749 750 UNION 751 752 SELECT 753 'InDel' AS Type, 754 count(*) AS count 755 FROM 
{table_variants_from} 756 WHERE len(REF) > 1 OR len(ALT) > 1 757 AND len(REF) != len(ALT) 758 759 UNION 760 761 SELECT 762 'SNV' AS Type, 763 count(*) AS count 764 FROM {table_variants_from} 765 WHERE len(REF) = 1 AND len(ALT) = 1 766 767 ) 768 769 ORDER BY count DESC 770 771 """ 772 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 773 774 sql_query_snv_substitution = f""" 775 SELECT 776 concat(REF, '>', ALT) AS 'Substitution', 777 count(*) AS count 778 FROM {table_variants_from} 779 WHERE len(REF) = 1 AND len(ALT) = 1 780 GROUP BY REF, ALT 781 ORDER BY count(*) DESC 782 """ 783 snv_substitution = ( 784 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 785 ) 786 stats["Variants"]["Counts"] = snv_indel 787 stats["Variants"]["Substitutions"] = snv_substitution 788 789 return stats 790 791 def stats_to_file(self, file: str = None) -> str: 792 """ 793 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 794 into a JSON object, and writes the JSON object to the specified file. 795 796 :param file: The `file` parameter is a string that represents the file path where the JSON data 797 will be written 798 :type file: str 799 :return: the name of the file that was written to. 800 """ 801 802 # Get stats 803 stats = self.get_stats() 804 805 # Serializing json 806 json_object = json.dumps(stats, indent=4) 807 808 # Writing to sample.json 809 with open(file, "w") as outfile: 810 outfile.write(json_object) 811 812 return file 813 814 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 815 """ 816 The `print_stats` function generates a markdown file and prints the statistics contained in a 817 JSON file in a formatted manner. 818 819 :param output_file: The `output_file` parameter is a string that specifies the path and filename 820 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 821 provided, a temporary directory will be created and the stats will be saved in a file named 822 "stats.md" within that 823 :type output_file: str 824 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 825 file where the statistics will be saved. If no value is provided, a temporary directory will be 826 created and a default file name "stats.json" will be used 827 :type json_file: str 828 :return: The function `print_stats` does not return any value. It has a return type annotation 829 of `None`. 830 """ 831 832 # Full path 833 output_file = full_path(output_file) 834 json_file = full_path(json_file) 835 836 with tempfile.TemporaryDirectory() as tmpdir: 837 838 # Files 839 if not output_file: 840 output_file = os.path.join(tmpdir, "stats.md") 841 if not json_file: 842 json_file = os.path.join(tmpdir, "stats.json") 843 844 # Create folders 845 if not os.path.exists(os.path.dirname(output_file)): 846 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 847 if not os.path.exists(os.path.dirname(json_file)): 848 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 849 850 # Create stats JSON file 851 stats_file = self.stats_to_file(file=json_file) 852 853 # Print stats file 854 with open(stats_file) as f: 855 stats = yaml.safe_load(f) 856 857 # Output 858 output_title = [] 859 output_index = [] 860 output = [] 861 862 # Title 863 output_title.append("# HOWARD Stats") 864 865 # Index 866 output_index.append("## Index") 867 868 # Process sections 869 for section in stats: 870 infos = stats.get(section) 871 section_link = "#" + section.lower().replace(" ", "-") 872 output.append(f"## {section}") 873 output_index.append(f"- [{section}]({section_link})") 874 875 if len(infos): 876 for info in infos: 877 try: 878 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 879 is_df = True 880 except: 881 try: 882 df = pd.DataFrame.from_dict( 883 
json.loads((infos.get(info))), orient="index" 884 ) 885 is_df = True 886 except: 887 is_df = False 888 if is_df: 889 output.append(f"### {info}") 890 info_link = "#" + info.lower().replace(" ", "-") 891 output_index.append(f" - [{info}]({info_link})") 892 output.append(f"{df.to_markdown(index=False)}") 893 else: 894 output.append(f"- {info}: {infos.get(info)}") 895 else: 896 output.append(f"NA") 897 898 # Write stats in markdown file 899 with open(output_file, "w") as fp: 900 for item in output_title: 901 fp.write("%s\n" % item) 902 for item in output_index: 903 fp.write("%s\n" % item) 904 for item in output: 905 fp.write("%s\n" % item) 906 907 # Output stats in markdown 908 print("") 909 print("\n\n".join(output_title)) 910 print("") 911 print("\n\n".join(output)) 912 print("") 913 914 return None 915 916 def get_input(self) -> str: 917 """ 918 It returns the value of the input variable. 919 :return: The input is being returned. 920 """ 921 return self.input 922 923 def get_input_format(self, input_file: str = None) -> str: 924 """ 925 This function returns the format of the input variable, either from the provided input file or 926 by prompting for input. 927 928 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 929 represents the file path of the input file. If no `input_file` is provided when calling the 930 method, it will default to `None` 931 :type input_file: str 932 :return: The format of the input variable is being returned. 933 """ 934 935 if not input_file: 936 input_file = self.get_input() 937 input_format = get_file_format(input_file) 938 return input_format 939 940 def get_input_compressed(self, input_file: str = None) -> str: 941 """ 942 The function `get_input_compressed` returns the format of the input variable after compressing 943 it. 944 945 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 946 that represents the file path of the input file. 
If no `input_file` is provided when calling the 947 method, it will default to `None` and the method will then call `self.get_input()` to 948 :type input_file: str 949 :return: The function `get_input_compressed` returns the compressed format of the input 950 variable. 951 """ 952 953 if not input_file: 954 input_file = self.get_input() 955 input_compressed = get_file_compressed(input_file) 956 return input_compressed 957 958 def get_output(self) -> str: 959 """ 960 It returns the output of the neuron. 961 :return: The output of the neural network. 962 """ 963 964 return self.output 965 966 def get_output_format(self, output_file: str = None) -> str: 967 """ 968 The function `get_output_format` returns the format of the input variable or the output file if 969 provided. 970 971 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 972 that represents the file path of the output file. If no `output_file` is provided when calling 973 the method, it will default to the output obtained from the `get_output` method of the class 974 instance. The 975 :type output_file: str 976 :return: The format of the input variable is being returned. 977 """ 978 979 if not output_file: 980 output_file = self.get_output() 981 output_format = get_file_format(output_file) 982 983 return output_format 984 985 def get_config(self) -> dict: 986 """ 987 It returns the config 988 :return: The config variable is being returned. 989 """ 990 return self.config 991 992 def get_param(self) -> dict: 993 """ 994 It returns the param 995 :return: The param variable is being returned. 996 """ 997 return self.param 998 999 def get_connexion_db(self) -> str: 1000 """ 1001 It returns the connexion_db attribute of the object 1002 :return: The connexion_db is being returned. 1003 """ 1004 return self.connexion_db 1005 1006 def get_prefix(self) -> str: 1007 """ 1008 It returns the prefix of the object. 1009 :return: The prefix is being returned. 
1010 """ 1011 return self.prefix 1012 1013 def get_table_variants(self, clause: str = "select") -> str: 1014 """ 1015 This function returns the table_variants attribute of the object 1016 1017 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1018 defaults to select (optional) 1019 :return: The table_variants attribute of the object. 1020 """ 1021 1022 # Access 1023 access = self.get_config().get("access", None) 1024 1025 # Clauses "select", "where", "update" 1026 if clause in ["select", "where", "update"]: 1027 table_variants = self.table_variants 1028 # Clause "from" 1029 elif clause in ["from"]: 1030 # For Read Only 1031 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1032 input_file = self.get_input() 1033 table_variants = f"'{input_file}' as variants" 1034 # For Read Write 1035 else: 1036 table_variants = f"{self.table_variants} as variants" 1037 else: 1038 table_variants = self.table_variants 1039 return table_variants 1040 1041 def get_tmp_dir(self) -> str: 1042 """ 1043 The function `get_tmp_dir` returns the temporary directory path based on configuration 1044 parameters or a default path. 1045 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1046 configuration, parameters, and a default value of "/tmp". 1047 """ 1048 1049 return get_tmp( 1050 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1051 ) 1052 1053 def get_connexion_type(self) -> str: 1054 """ 1055 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1056 1057 :return: The connexion type is being returned. 1058 """ 1059 return self.get_config().get("connexion_type", "memory") 1060 1061 def get_connexion(self): 1062 """ 1063 It returns the connection object 1064 1065 :return: The connection object. 1066 """ 1067 return self.conn 1068 1069 def close_connexion(self) -> None: 1070 """ 1071 This function closes the connection to the database. 
1072 :return: The connection is being closed. 1073 """ 1074 return self.conn.close() 1075 1076 def get_header(self, type: str = "vcf"): 1077 """ 1078 This function returns the header of the VCF file as a list of strings 1079 1080 :param type: the type of header you want to get, defaults to vcf (optional) 1081 :return: The header of the vcf file. 1082 """ 1083 1084 if self.header_vcf: 1085 if type == "vcf": 1086 return self.header_vcf 1087 elif type == "list": 1088 return self.header_list 1089 else: 1090 if type == "vcf": 1091 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1092 return header 1093 elif type == "list": 1094 return vcf_required 1095 1096 def get_header_infos_list(self) -> list: 1097 """ 1098 This function retrieves a list of information fields from the header. 1099 :return: A list of information fields from the header. 1100 """ 1101 1102 # Init 1103 infos_list = [] 1104 1105 for field in self.get_header().infos: 1106 infos_list.append(field) 1107 1108 return infos_list 1109 1110 def get_header_length(self, file: str = None) -> int: 1111 """ 1112 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1113 line. 1114 1115 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1116 header file. If this argument is provided, the function will read the header from the specified 1117 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1118 :type file: str 1119 :return: the length of the header list, excluding the #CHROM line. 1120 """ 1121 1122 if file: 1123 return len(self.read_vcf_header_file(file=file)) - 1 1124 elif self.get_header(type="list"): 1125 return len(self.get_header(type="list")) - 1 1126 else: 1127 return 0 1128 1129 def get_header_columns(self) -> str: 1130 """ 1131 This function returns the header list of a VCF 1132 1133 :return: The length of the header list. 
1134 """ 1135 if self.get_header(): 1136 return self.get_header(type="list")[-1] 1137 else: 1138 return "" 1139 1140 def get_header_columns_as_list(self) -> list: 1141 """ 1142 This function returns the header list of a VCF 1143 1144 :return: The length of the header list. 1145 """ 1146 if self.get_header(): 1147 return self.get_header_columns().strip().split("\t") 1148 else: 1149 return [] 1150 1151 def get_header_columns_as_sql(self) -> str: 1152 """ 1153 This function retruns header length (without #CHROM line) 1154 1155 :return: The length of the header list. 1156 """ 1157 sql_column_list = [] 1158 for col in self.get_header_columns_as_list(): 1159 sql_column_list.append(f'"{col}"') 1160 return ",".join(sql_column_list) 1161 1162 def get_header_sample_list( 1163 self, check: bool = False, samples: list = None, samples_force: bool = False 1164 ) -> list: 1165 """ 1166 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1167 checking and filtering based on input parameters. 1168 1169 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1170 parameter that determines whether to check if the samples in the list are properly defined as 1171 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1172 list is defined as a, defaults to False 1173 :type check: bool (optional) 1174 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1175 allows you to specify a subset of samples from the header. If you provide a list of sample 1176 names, the function will check if each sample is defined in the header. 
If a sample is not found 1177 in the 1178 :type samples: list 1179 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1180 a boolean parameter that determines whether to force the function to return the sample list 1181 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1182 function will return the sample list without performing, defaults to False 1183 :type samples_force: bool (optional) 1184 :return: The function `get_header_sample_list` returns a list of samples based on the input 1185 parameters and conditions specified in the function. 1186 """ 1187 1188 # Init 1189 samples_list = [] 1190 1191 if samples is None: 1192 samples_list = self.header_vcf.samples 1193 else: 1194 samples_checked = [] 1195 for sample in samples: 1196 if sample in self.header_vcf.samples: 1197 samples_checked.append(sample) 1198 else: 1199 log.warning(f"Sample '{sample}' not defined in header") 1200 samples_list = samples_checked 1201 1202 # Force sample list without checking if is_genotype_column 1203 if samples_force: 1204 log.warning(f"Samples {samples_list} not checked if genotypes") 1205 return samples_list 1206 1207 if check: 1208 samples_checked = [] 1209 for sample in samples_list: 1210 if self.is_genotype_column(column=sample): 1211 samples_checked.append(sample) 1212 else: 1213 log.warning( 1214 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1215 ) 1216 samples_list = samples_checked 1217 1218 # Return samples list 1219 return samples_list 1220 1221 def is_genotype_column(self, column: str = None) -> bool: 1222 """ 1223 This function checks if a given column is a genotype column in a database. 1224 1225 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1226 represents the column name in a database table. This method checks if the specified column is a 1227 genotype column in the database. 
If a column name is provided, it calls the `is_genotype_column` 1228 method of 1229 :type column: str 1230 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1231 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1232 column name and returns the result. If the `column` parameter is None, it returns False. 1233 """ 1234 1235 if column is not None: 1236 return Database(database=self.get_input()).is_genotype_column(column=column) 1237 else: 1238 return False 1239 1240 def get_verbose(self) -> bool: 1241 """ 1242 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1243 exist 1244 1245 :return: The value of the key "verbose" in the config dictionary. 1246 """ 1247 return self.get_config().get("verbose", False) 1248 1249 def get_connexion_format(self) -> str: 1250 """ 1251 It returns the connexion format of the object. 1252 :return: The connexion_format is being returned. 1253 """ 1254 connexion_format = self.connexion_format 1255 if connexion_format not in ["duckdb", "sqlite"]: 1256 log.error(f"Unknown connexion format {connexion_format}") 1257 raise ValueError(f"Unknown connexion format {connexion_format}") 1258 else: 1259 return connexion_format 1260 1261 def insert_file_to_table( 1262 self, 1263 file, 1264 columns: str, 1265 header_len: int = 0, 1266 sep: str = "\t", 1267 chunksize: int = 1000000, 1268 ) -> None: 1269 """ 1270 The function reads a file in chunks and inserts each chunk into a table based on the specified 1271 database format. 1272 1273 :param file: The `file` parameter is the file that you want to load into a table. It should be 1274 the path to the file on your system 1275 :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that 1276 should contain the names of the columns in the table where the data will be inserted. 
The column 1277 names should be separated by commas within the string. For example, if you have columns named 1278 "id", "name 1279 :type columns: str 1280 :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies 1281 the number of lines to skip at the beginning of the file before reading the actual data. This 1282 parameter allows you to skip any header information present in the file before processing the 1283 data, defaults to 0 1284 :type header_len: int (optional) 1285 :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the 1286 separator character that is used in the file being read. In this case, the default separator is 1287 set to `\t`, which represents a tab character. You can change this parameter to a different 1288 separator character if, defaults to \t 1289 :type sep: str (optional) 1290 :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time 1291 when processing the file in chunks. In the provided code snippet, the default value for 1292 `chunksize` is set to 1000000. 
This means that the file will be read in chunks of 1,, defaults 1293 to 1000000 1294 :type chunksize: int (optional) 1295 """ 1296 1297 # Config 1298 chunksize = self.get_config().get("load", {}).get("chunk", chunksize) 1299 connexion_format = self.get_connexion_format() 1300 1301 log.debug("chunksize: " + str(chunksize)) 1302 1303 if chunksize: 1304 for chunk in pd.read_csv( 1305 file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c" 1306 ): 1307 if connexion_format in ["duckdb"]: 1308 sql_insert_into = ( 1309 f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk" 1310 ) 1311 self.conn.execute(sql_insert_into) 1312 elif connexion_format in ["sqlite"]: 1313 chunk.to_sql("variants", self.conn, if_exists="append", index=False) 1314 1315 def load_data( 1316 self, 1317 input_file: str = None, 1318 drop_variants_table: bool = False, 1319 sample_size: int = 20480, 1320 ) -> None: 1321 """ 1322 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1323 table before loading the data and specify a sample size. 1324 1325 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1326 table 1327 :type input_file: str 1328 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1329 determines whether the variants table should be dropped before loading the data. If set to 1330 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1331 not be dropped, defaults to False 1332 :type drop_variants_table: bool (optional) 1333 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1334 the input file. 
If it is set to `None`, the default value of 20480 will be used, defaults to 1335 20480 1336 :type sample_size: int (optional) 1337 """ 1338 1339 log.info("Loading...") 1340 1341 # change input file 1342 if input_file: 1343 self.set_input(input_file) 1344 self.set_header() 1345 1346 # drop variants table 1347 if drop_variants_table: 1348 self.drop_variants_table() 1349 1350 # get table variants 1351 table_variants = self.get_table_variants() 1352 1353 # Access 1354 access = self.get_config().get("access", None) 1355 log.debug(f"access: {access}") 1356 1357 # Input format and compress 1358 input_format = self.get_input_format() 1359 input_compressed = self.get_input_compressed() 1360 log.debug(f"input_format: {input_format}") 1361 log.debug(f"input_compressed: {input_compressed}") 1362 1363 # input_compressed_format 1364 if input_compressed: 1365 input_compressed_format = "gzip" 1366 else: 1367 input_compressed_format = "none" 1368 log.debug(f"input_compressed_format: {input_compressed_format}") 1369 1370 # Connexion format 1371 connexion_format = self.get_connexion_format() 1372 1373 # Sample size 1374 if not sample_size: 1375 sample_size = -1 1376 log.debug(f"sample_size: {sample_size}") 1377 1378 # Load data 1379 log.debug(f"Load Data from {input_format}") 1380 1381 # DuckDB connexion 1382 if connexion_format in ["duckdb"]: 1383 1384 # Database already exists 1385 if self.input_format in ["db", "duckdb"]: 1386 1387 if connexion_format in ["duckdb"]: 1388 log.debug(f"Input file format '{self.input_format}' duckDB") 1389 else: 1390 log.error( 1391 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1392 ) 1393 raise ValueError( 1394 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1395 ) 1396 1397 # Load from existing database format 1398 else: 1399 1400 try: 1401 # Create Table or View 1402 database = Database(database=self.input) 1403 sql_from = 
database.get_sql_from(sample_size=sample_size) 1404 1405 if access in ["RO"]: 1406 sql_load = ( 1407 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1408 ) 1409 else: 1410 sql_load = ( 1411 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1412 ) 1413 self.conn.execute(sql_load) 1414 1415 except: 1416 # Format not available 1417 log.error(f"Input file format '{self.input_format}' not available") 1418 raise ValueError( 1419 f"Input file format '{self.input_format}' not available" 1420 ) 1421 1422 # SQLite connexion 1423 elif connexion_format in ["sqlite"] and input_format in [ 1424 "vcf", 1425 "tsv", 1426 "csv", 1427 "psv", 1428 ]: 1429 1430 # Main structure 1431 structure = { 1432 "#CHROM": "VARCHAR", 1433 "POS": "INTEGER", 1434 "ID": "VARCHAR", 1435 "REF": "VARCHAR", 1436 "ALT": "VARCHAR", 1437 "QUAL": "VARCHAR", 1438 "FILTER": "VARCHAR", 1439 "INFO": "VARCHAR", 1440 } 1441 1442 # Strcuture with samples 1443 structure_complete = structure 1444 if self.get_header_sample_list(): 1445 structure["FORMAT"] = "VARCHAR" 1446 for sample in self.get_header_sample_list(): 1447 structure_complete[sample] = "VARCHAR" 1448 1449 # Columns list for create and insert 1450 sql_create_table_columns = [] 1451 sql_create_table_columns_list = [] 1452 for column in structure_complete: 1453 column_type = structure_complete[column] 1454 sql_create_table_columns.append( 1455 f'"{column}" {column_type} default NULL' 1456 ) 1457 sql_create_table_columns_list.append(f'"{column}"') 1458 1459 # Create database 1460 log.debug(f"Create Table {table_variants}") 1461 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1462 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1463 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1464 self.conn.execute(sql_create_table) 1465 1466 # chunksize define length of file chunk load file 1467 chunksize = 100000 1468 1469 # delimiter 1470 delimiter 
= file_format_delimiters.get(input_format, "\t") 1471 1472 # Load the input file 1473 with open(self.input, "rt") as input_file: 1474 1475 # Use the appropriate file handler based on the input format 1476 if input_compressed: 1477 input_file = bgzf.open(self.input, "rt") 1478 if input_format in ["vcf"]: 1479 header_len = self.get_header_length() 1480 else: 1481 header_len = 0 1482 1483 # Insert the file contents into a table 1484 self.insert_file_to_table( 1485 input_file, 1486 columns=sql_create_table_columns_list_sql, 1487 header_len=header_len, 1488 sep=delimiter, 1489 chunksize=chunksize, 1490 ) 1491 1492 else: 1493 log.error( 1494 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1495 ) 1496 raise ValueError( 1497 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1498 ) 1499 1500 # Explode INFOS fields into table fields 1501 if self.get_explode_infos(): 1502 self.explode_infos( 1503 prefix=self.get_explode_infos_prefix(), 1504 fields=self.get_explode_infos_fields(), 1505 force=True, 1506 ) 1507 1508 # Create index after insertion 1509 self.create_indexes() 1510 1511 def get_explode_infos(self) -> bool: 1512 """ 1513 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1514 to False if it is not set. 1515 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1516 value. If the parameter is not present, it will return False. 1517 """ 1518 1519 return self.get_param().get("explode", {}).get("explode_infos", False) 1520 1521 def get_explode_infos_fields( 1522 self, 1523 explode_infos_fields: str = None, 1524 remove_fields_not_in_header: bool = False, 1525 ) -> list: 1526 """ 1527 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1528 the input parameter `explode_infos_fields`. 
1529 1530 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1531 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1532 comma-separated list of field names to explode 1533 :type explode_infos_fields: str 1534 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1535 flag that determines whether to remove fields that are not present in the header. If it is set 1536 to `True`, any field that is not in the header will be excluded from the list of exploded 1537 information fields. If it is set to `, defaults to False 1538 :type remove_fields_not_in_header: bool (optional) 1539 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1540 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1541 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1542 Otherwise, it returns a list of exploded information fields after removing any spaces and 1543 splitting the string by commas. 
1544 """ 1545 1546 # If no fields, get it in param 1547 if not explode_infos_fields: 1548 explode_infos_fields = ( 1549 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1550 ) 1551 1552 # If no fields, defined as all fields in header using keyword 1553 if not explode_infos_fields: 1554 explode_infos_fields = "*" 1555 1556 # If fields list not empty 1557 if explode_infos_fields: 1558 1559 # Input fields list 1560 if isinstance(explode_infos_fields, str): 1561 fields_input = explode_infos_fields.split(",") 1562 elif isinstance(explode_infos_fields, list): 1563 fields_input = explode_infos_fields 1564 else: 1565 fields_input = [] 1566 1567 # Fields list without * keyword 1568 fields_without_all = fields_input.copy() 1569 if "*".casefold() in (item.casefold() for item in fields_without_all): 1570 fields_without_all.remove("*") 1571 1572 # Fields in header 1573 fields_in_header = sorted(list(set(self.get_header().infos))) 1574 1575 # Construct list of fields 1576 fields_output = [] 1577 for field in fields_input: 1578 1579 # Strip field 1580 field = field.strip() 1581 1582 # format keyword * in regex 1583 if field.upper() in ["*"]: 1584 field = ".*" 1585 1586 # Find all fields with pattern 1587 r = re.compile(field) 1588 fields_search = sorted(list(filter(r.match, fields_in_header))) 1589 1590 # Remove fields input from search 1591 if field in fields_search: 1592 fields_search = [field] 1593 elif fields_search != [field]: 1594 fields_search = sorted( 1595 list(set(fields_search).difference(fields_input)) 1596 ) 1597 1598 # If field is not in header (avoid not well formatted header) 1599 if not fields_search and not remove_fields_not_in_header: 1600 fields_search = [field] 1601 1602 # Add found fields 1603 for new_field in fields_search: 1604 # Add field, if not already exists, and if it is in header (if asked) 1605 if ( 1606 new_field not in fields_output 1607 and ( 1608 not remove_fields_not_in_header 1609 or new_field in fields_in_header 1610 ) 
1611 and new_field not in [".*"] 1612 ): 1613 fields_output.append(new_field) 1614 1615 return fields_output 1616 1617 else: 1618 1619 return [] 1620 1621 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1622 """ 1623 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1624 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1625 not provided. 1626 1627 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1628 prefix to be used for exploding or expanding information 1629 :type explode_infos_prefix: str 1630 :return: the value of the variable `explode_infos_prefix`. 1631 """ 1632 1633 if not explode_infos_prefix: 1634 explode_infos_prefix = ( 1635 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1636 ) 1637 1638 return explode_infos_prefix 1639 1640 def add_column( 1641 self, 1642 table_name, 1643 column_name, 1644 column_type, 1645 default_value=None, 1646 drop: bool = False, 1647 ) -> dict: 1648 """ 1649 The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it 1650 doesn't already exist. 1651 1652 :param table_name: The name of the table to which you want to add a column 1653 :param column_name: The parameter "column_name" is the name of the column that you want to add 1654 to the table 1655 :param column_type: The `column_type` parameter specifies the data type of the column that you 1656 want to add to the table. It should be a string that represents the desired data type, such as 1657 "INTEGER", "TEXT", "REAL", etc 1658 :param default_value: The `default_value` parameter is an optional parameter that specifies the 1659 default value for the newly added column. 
If a default value is provided, it will be assigned to 1660 the column for any existing rows that do not have a value for that column 1661 :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column 1662 if it already exists in the table. If `drop` is set to `True`, the function will drop the 1663 existing column before adding the new column. If `drop` is set to `False` (default),, defaults 1664 to False 1665 :type drop: bool (optional) 1666 :return: a boolean value indicating whether the column was successfully added to the table. 1667 """ 1668 1669 # added 1670 added = False 1671 dropped = False 1672 1673 # Check if the column already exists in the table 1674 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1675 columns = self.get_query_to_df(query).columns.tolist() 1676 if column_name.upper() in [c.upper() for c in columns]: 1677 log.debug( 1678 f"The {column_name} column already exists in the {table_name} table" 1679 ) 1680 if drop: 1681 self.drop_column(table_name=table_name, column_name=column_name) 1682 dropped = True 1683 else: 1684 return None 1685 else: 1686 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1687 1688 # Add column in table 1689 add_column_query = ( 1690 f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """ 1691 ) 1692 if default_value is not None: 1693 add_column_query += f" DEFAULT {default_value}" 1694 self.execute_query(add_column_query) 1695 added = not dropped 1696 log.debug( 1697 f"The {column_name} column was successfully added to the {table_name} table" 1698 ) 1699 1700 if added: 1701 added_column = { 1702 "table_name": table_name, 1703 "column_name": column_name, 1704 "column_type": column_type, 1705 "default_value": default_value, 1706 } 1707 else: 1708 added_column = None 1709 1710 return added_column 1711 1712 def drop_column( 1713 self, column: dict = None, table_name: str = None, column_name: str = None 1714 ) -> bool: 1715 """ 1716 The 
`drop_column` function drops a specified column from a given table in a database and returns 1717 True if the column was successfully dropped, and False if the column does not exist in the 1718 table. 1719 1720 :param column: The `column` parameter is a dictionary that contains information about the column 1721 you want to drop. It has two keys: 1722 :type column: dict 1723 :param table_name: The `table_name` parameter is the name of the table from which you want to 1724 drop a column 1725 :type table_name: str 1726 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1727 from the table 1728 :type column_name: str 1729 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1730 and False if the column does not exist in the table. 1731 """ 1732 1733 # Find column infos 1734 if column: 1735 if isinstance(column, dict): 1736 table_name = column.get("table_name", None) 1737 column_name = column.get("column_name", None) 1738 elif isinstance(column, str): 1739 table_name = self.get_table_variants() 1740 column_name = column 1741 else: 1742 table_name = None 1743 column_name = None 1744 1745 if not table_name and not column_name: 1746 return False 1747 1748 # Removed 1749 removed = False 1750 1751 # Check if the column already exists in the table 1752 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1753 columns = self.get_query_to_df(query).columns.tolist() 1754 if column_name in columns: 1755 log.debug(f"The {column_name} column exists in the {table_name} table") 1756 else: 1757 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1758 return False 1759 1760 # Add column in table # ALTER TABLE integers DROP k 1761 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1762 self.execute_query(add_column_query) 1763 removed = True 1764 log.debug( 1765 f"The {column_name} column was successfully dropped to the {table_name} table" 1766 ) 1767 
1768 return removed 1769 1770 def explode_infos( 1771 self, 1772 prefix: str = None, 1773 create_index: bool = False, 1774 fields: list = None, 1775 force: bool = False, 1776 proccess_all_fields_together: bool = False, 1777 table: str = None, 1778 ) -> list: 1779 """ 1780 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1781 individual columns, returning a list of added columns. 1782 1783 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1784 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1785 `self.get_explode_infos_prefix()` as the prefix 1786 :type prefix: str 1787 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1788 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1789 `False`, indexes will not be created. The default value is `False`, defaults to False 1790 :type create_index: bool (optional) 1791 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1792 that you want to explode into individual columns. If this parameter is not provided, all INFO 1793 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1794 a list to the ` 1795 :type fields: list 1796 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1797 determines whether to drop and recreate a column if it already exists in the table. If `force` 1798 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1799 defaults to False 1800 :type force: bool (optional) 1801 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1802 flag that determines whether to process all the INFO fields together or individually. If set to 1803 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1804 be processed individually. The default value is, defaults to False 1805 :type proccess_all_fields_together: bool (optional) 1806 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1807 of the table where the exploded INFO fields will be added as individual columns. If you provide 1808 a value for the `table` parameter, the function will use that table name. If the `table` 1809 parameter is 1810 :type table: str 1811 :return: The `explode_infos` function returns a list of added columns. 1812 """ 1813 1814 # drop indexes 1815 self.drop_indexes() 1816 1817 # connexion format 1818 connexion_format = self.get_connexion_format() 1819 1820 # Access 1821 access = self.get_config().get("access", None) 1822 1823 # Added columns 1824 added_columns = [] 1825 1826 if access not in ["RO"]: 1827 1828 # prefix 1829 if prefix in [None, True] or not isinstance(prefix, str): 1830 if self.get_explode_infos_prefix() not in [None, True]: 1831 prefix = self.get_explode_infos_prefix() 1832 else: 1833 prefix = "INFO/" 1834 1835 # table variants 1836 if table is not None: 1837 table_variants = table 1838 else: 1839 table_variants = self.get_table_variants(clause="select") 1840 1841 # extra infos 1842 try: 1843 extra_infos = self.get_extra_infos() 1844 except: 1845 extra_infos = [] 1846 1847 # Header infos 1848 header_infos = self.get_header().infos 1849 1850 log.debug( 1851 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1852 ) 1853 1854 sql_info_alter_table_array = [] 1855 1856 # Info fields to check 1857 fields_list = list(header_infos) 1858 if fields: 1859 fields_list += fields 1860 fields_list = set(fields_list) 1861 1862 # If no fields 1863 if not fields: 1864 fields = [] 1865 1866 # Translate fields if patterns 1867 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1868 1869 for info in fields: 1870 1871 info_id_sql = prefix + info 1872 1873 if ( 1874 info 
in fields_list 1875 or prefix + info in fields_list 1876 or info in extra_infos 1877 ): 1878 1879 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1880 1881 if info in header_infos: 1882 info_type = header_infos[info].type 1883 info_num = header_infos[info].num 1884 else: 1885 info_type = "String" 1886 info_num = 0 1887 1888 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1889 if info_num != 1: 1890 type_sql = "VARCHAR" 1891 1892 # Add field 1893 added_column = self.add_column( 1894 table_name=table_variants, 1895 column_name=info_id_sql, 1896 column_type=type_sql, 1897 default_value="null", 1898 drop=force, 1899 ) 1900 1901 if added_column: 1902 added_columns.append(added_column) 1903 1904 if added_column or force: 1905 1906 # add field to index 1907 self.index_additionnal_fields.append(info_id_sql) 1908 1909 # Update field array 1910 if connexion_format in ["duckdb"]: 1911 update_info_field = f""" 1912 "{info_id_sql}" = 1913 CASE 1914 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1915 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1916 END 1917 """ 1918 elif connexion_format in ["sqlite"]: 1919 update_info_field = f""" 1920 "{info_id_sql}" = 1921 CASE 1922 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1923 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1924 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1925 END 1926 """ 1927 1928 sql_info_alter_table_array.append(update_info_field) 1929 1930 if sql_info_alter_table_array: 1931 1932 # By chromosomes 1933 try: 1934 chromosomes_list = list( 1935 self.get_query_to_df( 1936 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1937 )["#CHROM"] 1938 ) 1939 except: 1940 chromosomes_list = [None] 1941 1942 for chrom in chromosomes_list: 1943 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1944 1945 # Where clause 1946 where_clause = "" 1947 if chrom and len(chromosomes_list) > 1: 1948 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1949 1950 # Update table 1951 if proccess_all_fields_together: 1952 sql_info_alter_table_array_join = ", ".join( 1953 sql_info_alter_table_array 1954 ) 1955 if sql_info_alter_table_array_join: 1956 sql_info_alter_table = f""" 1957 UPDATE {table_variants} 1958 SET {sql_info_alter_table_array_join} 1959 {where_clause} 1960 """ 1961 log.debug( 1962 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1963 ) 1964 # log.debug(sql_info_alter_table) 1965 self.conn.execute(sql_info_alter_table) 1966 else: 1967 sql_info_alter_num = 0 1968 for sql_info_alter in sql_info_alter_table_array: 1969 sql_info_alter_num += 1 1970 sql_info_alter_table = f""" 1971 UPDATE {table_variants} 1972 SET {sql_info_alter} 1973 {where_clause} 1974 """ 1975 log.debug( 1976 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 
1977 ) 1978 # log.debug(sql_info_alter_table) 1979 self.conn.execute(sql_info_alter_table) 1980 1981 # create indexes 1982 if create_index: 1983 self.create_indexes() 1984 1985 return added_columns 1986 1987 def create_indexes(self) -> None: 1988 """ 1989 Create indexes on the table after insertion 1990 """ 1991 1992 # Access 1993 access = self.get_config().get("access", None) 1994 1995 # get table variants 1996 table_variants = self.get_table_variants("FROM") 1997 1998 if self.get_indexing() and access not in ["RO"]: 1999 # Create index 2000 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2001 self.conn.execute(sql_create_table_index) 2002 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2003 self.conn.execute(sql_create_table_index) 2004 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2005 self.conn.execute(sql_create_table_index) 2006 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2007 self.conn.execute(sql_create_table_index) 2008 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2009 self.conn.execute(sql_create_table_index) 2010 for field in self.index_additionnal_fields: 2011 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2012 self.conn.execute(sql_create_table_index) 2013 2014 def drop_indexes(self) -> None: 2015 """ 2016 Create indexes on the table after insertion 2017 """ 2018 2019 # Access 2020 access = self.get_config().get("access", None) 2021 2022 # get table variants 2023 table_variants = self.get_table_variants("FROM") 2024 2025 # Get database format 2026 connexion_format = 
self.get_connexion_format() 2027 2028 if access not in ["RO"]: 2029 if connexion_format in ["duckdb"]: 2030 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2031 elif connexion_format in ["sqlite"]: 2032 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2033 2034 list_indexes = self.conn.execute(sql_list_indexes) 2035 index_names = [row[0] for row in list_indexes.fetchall()] 2036 for index in index_names: 2037 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2038 self.conn.execute(sql_drop_table_index) 2039 2040 def read_vcf_header(self, f) -> list: 2041 """ 2042 It reads the header of a VCF file and returns a list of the header lines 2043 2044 :param f: the file object 2045 :return: The header lines of the VCF file. 2046 """ 2047 2048 header_list = [] 2049 for line in f: 2050 header_list.append(line) 2051 if line.startswith("#CHROM"): 2052 break 2053 return header_list 2054 2055 def read_vcf_header_file(self, file: str = None) -> list: 2056 """ 2057 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2058 uncompressed files. 2059 2060 :param file: The `file` parameter is a string that represents the path to the VCF header file 2061 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2062 default to `None` 2063 :type file: str 2064 :return: The function `read_vcf_header_file` returns a list. 2065 """ 2066 2067 if self.get_input_compressed(input_file=file): 2068 with bgzf.open(file, "rt") as f: 2069 return self.read_vcf_header(f=f) 2070 else: 2071 with open(file, "rt") as f: 2072 return self.read_vcf_header(f=f) 2073 2074 def execute_query(self, query: str): 2075 """ 2076 It takes a query as an argument, executes it, and returns the results 2077 2078 :param query: The query to be executed 2079 :return: The result of the query is being returned. 
2080 """ 2081 if query: 2082 return self.conn.execute(query) # .fetchall() 2083 else: 2084 return None 2085 2086 def export_output( 2087 self, 2088 output_file: str | None = None, 2089 output_header: str | None = None, 2090 export_header: bool = True, 2091 query: str | None = None, 2092 parquet_partitions: list | None = None, 2093 chunk_size: int | None = None, 2094 threads: int | None = None, 2095 sort: bool = False, 2096 index: bool = False, 2097 order_by: str | None = None, 2098 ) -> bool: 2099 """ 2100 The `export_output` function exports data from a VCF file to a specified output file in various 2101 formats, including VCF, CSV, TSV, PSV, and Parquet. 2102 2103 :param output_file: The `output_file` parameter is a string that specifies the name of the 2104 output file to be generated by the function. This is where the exported data will be saved 2105 :type output_file: str 2106 :param output_header: The `output_header` parameter is a string that specifies the name of the 2107 file where the header of the VCF file will be exported. If this parameter is not provided, the 2108 header will be exported to a file with the same name as the `output_file` parameter, but with 2109 the extension " 2110 :type output_header: str 2111 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2112 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2113 True, the header will be exported to a file. If `export_header` is False, the header will not 2114 be, defaults to True, if output format is not VCF 2115 :type export_header: bool (optional) 2116 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2117 select specific data from the VCF file before exporting it. 
If provided, only the data that 2118 matches the query will be exported 2119 :type query: str 2120 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2121 columns to be used for partitioning the Parquet file during export. Partitioning is a way to 2122 organize data in a hierarchical directory structure based on the values of one or more columns. 2123 This can improve query performance when working with large datasets 2124 :type parquet_partitions: list 2125 :param chunk_size: The `chunk_size` parameter specifies the number of 2126 records in batch when exporting data in Parquet format. This parameter is used for 2127 partitioning the Parquet file into multiple files. 2128 :type chunk_size: int 2129 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2130 threads to be used during the export process. It determines the level of parallelism and can 2131 improve the performance of the export operation. If not provided, the function will use the 2132 default number of threads 2133 :type threads: int 2134 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2135 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2136 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2137 False 2138 :type sort: bool (optional) 2139 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2140 created on the output file. If `index` is True, an index will be created. If `index` is False, 2141 no index will be created. The default value is False, defaults to False 2142 :type index: bool (optional) 2143 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2144 sorting the output file. This parameter is only applicable when exporting data in VCF format 2145 :type order_by: str 2146 :return: a boolean value. 
It checks if the output file exists and returns True if it does, or 2147 None if it doesn't. 2148 """ 2149 2150 # Log 2151 log.info("Exporting...") 2152 2153 # Full path 2154 output_file = full_path(output_file) 2155 output_header = full_path(output_header) 2156 2157 # Config 2158 config = self.get_config() 2159 2160 # Param 2161 param = self.get_param() 2162 2163 # Tmp files to remove 2164 tmp_to_remove = [] 2165 2166 # If no output, get it 2167 if not output_file: 2168 output_file = self.get_output() 2169 2170 # If not threads 2171 if not threads: 2172 threads = self.get_threads() 2173 2174 # Auto header name with extension 2175 if export_header or output_header: 2176 if not output_header: 2177 output_header = f"{output_file}.hdr" 2178 # Export header 2179 self.export_header(output_file=output_file) 2180 2181 # Switch off export header if VCF output 2182 output_file_type = get_file_format(output_file) 2183 if output_file_type in ["vcf"]: 2184 export_header = False 2185 tmp_to_remove.append(output_header) 2186 2187 # Chunk size 2188 if not chunk_size: 2189 chunk_size = config.get("chunk_size", None) 2190 2191 # Parquet partition 2192 if not parquet_partitions: 2193 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2194 if parquet_partitions and isinstance(parquet_partitions, str): 2195 parquet_partitions = parquet_partitions.split(",") 2196 2197 # Order by 2198 if not order_by: 2199 order_by = param.get("export", {}).get("order_by", "") 2200 2201 # Header in output 2202 header_in_output = param.get("export", {}).get("include_header", False) 2203 2204 # Database 2205 database_source = self.get_connexion() 2206 2207 # Connexion format 2208 connexion_format = self.get_connexion_format() 2209 2210 # Explode infos 2211 if self.get_explode_infos(): 2212 self.explode_infos( 2213 prefix=self.get_explode_infos_prefix(), 2214 fields=self.get_explode_infos_fields(), 2215 force=False, 2216 ) 2217 2218 # if connexion_format in ["sqlite"] or query: 
2219 if connexion_format in ["sqlite"]: 2220 2221 # Export in Parquet 2222 random_tmp = "".join( 2223 random.choice(string.ascii_lowercase) for i in range(10) 2224 ) 2225 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2226 tmp_to_remove.append(database_source) 2227 2228 # Table Variants 2229 table_variants = self.get_table_variants() 2230 2231 # Create export query 2232 sql_query_export_subquery = f""" 2233 SELECT * FROM {table_variants} 2234 """ 2235 2236 # Write source file 2237 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2238 2239 # Create database 2240 database = Database( 2241 database=database_source, 2242 table="variants", 2243 header_file=output_header, 2244 conn_config=self.get_connexion_config(), 2245 ) 2246 2247 # Existing colomns header 2248 existing_columns_header = database.get_header_columns_from_database(query=query) 2249 2250 # Sample list 2251 if output_file_type in ["vcf"]: 2252 get_samples = self.get_samples() 2253 get_samples_check = self.get_samples_check() 2254 samples_force = get_samples is not None 2255 sample_list = self.get_header_sample_list( 2256 check=get_samples_check, 2257 samples=get_samples, 2258 samples_force=samples_force, 2259 ) 2260 else: 2261 sample_list = None 2262 2263 # Export file 2264 database.export( 2265 output_database=output_file, 2266 output_header=output_header, 2267 existing_columns_header=existing_columns_header, 2268 parquet_partitions=parquet_partitions, 2269 chunk_size=chunk_size, 2270 threads=threads, 2271 sort=sort, 2272 index=index, 2273 header_in_output=header_in_output, 2274 order_by=order_by, 2275 query=query, 2276 export_header=export_header, 2277 sample_list=sample_list, 2278 ) 2279 2280 # Remove 2281 remove_if_exists(tmp_to_remove) 2282 2283 return (os.path.exists(output_file) or None) and ( 2284 os.path.exists(output_file) or None 2285 ) 2286 2287 def get_extra_infos(self, table: str = None) -> list: 2288 """ 2289 The `get_extra_infos` 
function returns a list of columns that are in a specified table but not 2290 in the header. 2291 2292 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2293 name of the table from which you want to retrieve the extra columns that are not present in the 2294 header. If the `table` parameter is not provided when calling the function, it will default to 2295 using the variants 2296 :type table: str 2297 :return: A list of columns that are in the specified table but not in the header of the table. 2298 """ 2299 2300 header_columns = [] 2301 2302 if not table: 2303 table = self.get_table_variants(clause="from") 2304 header_columns = self.get_header_columns() 2305 2306 # Check all columns in the database 2307 query = f""" SELECT * FROM {table} LIMIT 1 """ 2308 log.debug(f"query {query}") 2309 table_columns = self.get_query_to_df(query).columns.tolist() 2310 extra_columns = [] 2311 2312 # Construct extra infos (not in header) 2313 for column in table_columns: 2314 if column not in header_columns: 2315 extra_columns.append(column) 2316 2317 return extra_columns 2318 2319 def get_extra_infos_sql(self, table: str = None) -> str: 2320 """ 2321 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2322 by double quotes 2323 2324 :param table: The name of the table to get the extra infos from. If None, the default table is 2325 used 2326 :type table: str 2327 :return: A string of the extra infos 2328 """ 2329 2330 return ", ".join( 2331 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2332 ) 2333 2334 def export_header( 2335 self, 2336 header_name: str = None, 2337 output_file: str = None, 2338 output_file_ext: str = ".hdr", 2339 clean_header: bool = True, 2340 remove_chrom_line: bool = False, 2341 ) -> str: 2342 """ 2343 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2344 specified options, and writes it to a new file. 
2345 2346 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2347 this parameter is not specified, the header will be written to the output file 2348 :type header_name: str 2349 :param output_file: The `output_file` parameter in the `export_header` function is used to 2350 specify the name of the output file where the header will be written. If this parameter is not 2351 provided, the header will be written to a temporary file 2352 :type output_file: str 2353 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2354 string that represents the extension of the output header file. By default, it is set to ".hdr" 2355 if not specified by the user. This extension will be appended to the `output_file` name to 2356 create the final, defaults to .hdr 2357 :type output_file_ext: str (optional) 2358 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2359 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2360 `True`, the function will clean the header by modifying certain lines based on a specific 2361 pattern. If `clean_header`, defaults to True 2362 :type clean_header: bool (optional) 2363 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2364 boolean flag that determines whether the #CHROM line should be removed from the header before 2365 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2366 defaults to False 2367 :type remove_chrom_line: bool (optional) 2368 :return: The function `export_header` returns the name of the temporary header file that is 2369 created. 
2370 """ 2371 2372 if not header_name and not output_file: 2373 output_file = self.get_output() 2374 2375 if self.get_header(): 2376 2377 # Get header object 2378 header_obj = self.get_header() 2379 2380 # Create database 2381 db_for_header = Database(database=self.get_input()) 2382 2383 # Get real columns in the file 2384 db_header_columns = db_for_header.get_columns() 2385 2386 with tempfile.TemporaryDirectory() as tmpdir: 2387 2388 # Write header file 2389 header_file_tmp = os.path.join(tmpdir, "header") 2390 f = open(header_file_tmp, "w") 2391 vcf.Writer(f, header_obj) 2392 f.close() 2393 2394 # Replace #CHROM line with rel columns 2395 header_list = db_for_header.read_header_file( 2396 header_file=header_file_tmp 2397 ) 2398 header_list[-1] = "\t".join(db_header_columns) 2399 2400 # Remove CHROM line 2401 if remove_chrom_line: 2402 header_list.pop() 2403 2404 # Clean header 2405 if clean_header: 2406 header_list_clean = [] 2407 for head in header_list: 2408 # Clean head for malformed header 2409 head_clean = head 2410 head_clean = re.subn( 2411 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2412 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2413 head_clean, 2414 2, 2415 )[0] 2416 # Write header 2417 header_list_clean.append(head_clean) 2418 header_list = header_list_clean 2419 2420 tmp_header_name = output_file + output_file_ext 2421 2422 f = open(tmp_header_name, "w") 2423 for line in header_list: 2424 f.write(line) 2425 f.close() 2426 2427 return tmp_header_name 2428 2429 def export_variant_vcf( 2430 self, 2431 vcf_file, 2432 remove_info: bool = False, 2433 add_samples: bool = True, 2434 list_samples: list = [], 2435 where_clause: str = "", 2436 index: bool = False, 2437 threads: int | None = None, 2438 ) -> bool | None: 2439 """ 2440 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2441 remove INFO field, add samples, and control compression and indexing. 
2442 2443 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2444 written to. It is the output file that will contain the filtered VCF data based on the specified 2445 parameters 2446 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2447 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2448 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2449 in, defaults to False 2450 :type remove_info: bool (optional) 2451 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2452 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2453 If set to False, the samples will be removed. The default value is True, defaults to True 2454 :type add_samples: bool (optional) 2455 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2456 in the output VCF file. By default, all samples will be included. If you provide a list of 2457 samples, only those samples will be included in the output file 2458 :type list_samples: list 2459 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2460 determines whether or not to create an index for the output VCF file. If `index` is set to 2461 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2462 :type index: bool (optional) 2463 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2464 number of threads to use for exporting the VCF file. It determines how many parallel threads 2465 will be used during the export process. More threads can potentially speed up the export process 2466 by utilizing multiple cores of the processor. 
If 2467 :type threads: int | None 2468 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2469 method with various parameters including the output file, query, threads, sort flag, and index 2470 flag. The `export_output` method is responsible for exporting the VCF data based on the 2471 specified parameters and configurations provided in the `export_variant_vcf` function. 2472 """ 2473 2474 # Config 2475 config = self.get_config() 2476 2477 # Extract VCF 2478 log.debug("Export VCF...") 2479 2480 # Table variants 2481 table_variants = self.get_table_variants() 2482 2483 # Threads 2484 if not threads: 2485 threads = self.get_threads() 2486 2487 # Info fields 2488 if remove_info: 2489 if not isinstance(remove_info, str): 2490 remove_info = "." 2491 info_field = f"""'{remove_info}' as INFO""" 2492 else: 2493 info_field = "INFO" 2494 2495 # Samples fields 2496 if add_samples: 2497 if not list_samples: 2498 list_samples = self.get_header_sample_list() 2499 if list_samples: 2500 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2501 else: 2502 samples_fields = "" 2503 log.debug(f"samples_fields: {samples_fields}") 2504 else: 2505 samples_fields = "" 2506 2507 # Where clause 2508 if where_clause is None: 2509 where_clause = "" 2510 2511 # Variants 2512 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2513 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2514 log.debug(f"sql_query_select={sql_query_select}") 2515 2516 return self.export_output( 2517 output_file=vcf_file, 2518 output_header=None, 2519 export_header=True, 2520 query=sql_query_select, 2521 parquet_partitions=None, 2522 chunk_size=config.get("chunk_size", None), 2523 threads=threads, 2524 sort=True, 2525 index=index, 2526 order_by=None, 2527 ) 2528 2529 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2530 """ 2531 It takes a list of commands and runs 
them in parallel using the number of threads specified 2532 2533 :param commands: A list of commands to run 2534 :param threads: The number of threads to use, defaults to 1 (optional) 2535 """ 2536 2537 run_parallel_commands(commands, threads) 2538 2539 def get_threads(self, default: int = 1) -> int: 2540 """ 2541 This function returns the number of threads to use for a job, with a default value of 1 if not 2542 specified. 2543 2544 :param default: The `default` parameter in the `get_threads` method is used to specify the 2545 default number of threads to use if no specific value is provided. If no value is provided for 2546 the `threads` parameter in the configuration or input parameters, the `default` value will be 2547 used, defaults to 1 2548 :type default: int (optional) 2549 :return: the number of threads to use for the current job. 2550 """ 2551 2552 # Config 2553 config = self.get_config() 2554 2555 # Param 2556 param = self.get_param() 2557 2558 # Input threads 2559 input_thread = param.get("threads", config.get("threads", None)) 2560 2561 # Check threads 2562 if not input_thread: 2563 threads = default 2564 elif int(input_thread) <= 0: 2565 threads = os.cpu_count() 2566 else: 2567 threads = int(input_thread) 2568 return threads 2569 2570 def get_memory(self, default: str = None) -> str: 2571 """ 2572 This function retrieves the memory value from parameters or configuration with a default value 2573 if not found. 2574 2575 :param default: The `get_memory` function takes in a default value as a string parameter. This 2576 default value is used as a fallback in case the `memory` parameter is not provided in the 2577 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2578 the function 2579 :type default: str 2580 :return: The `get_memory` function returns a string value representing the memory parameter. If 2581 the `input_memory` is provided in the parameters, it will return that value. 
    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table from a VCF file,
        dispatching to the engine-specific implementation: DataFrame-based
        for duckdb, temporary-table-based for sqlite.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        a VCF file, using duckdb.

        Matching rows (same "#CHROM"/POS/REF/ALT) have the VCF INFO appended
        to the existing INFO, separated by ';' when both sides are non-empty.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame (meta-header lines skipped, the
        # #CHROM line becomes the column header)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: duckdb resolves 'vcf_df' in the SQL below directly from this
        # local DataFrame (replacement scan) — the DataFrame must stay in scope
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)

    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of
        a VCF file, using sqlite: the VCF is loaded into a temporary table,
        joined on "#CHROM"/POS/REF/ALT, then the temporary table is dropped.

        :param vcf_file: the path to the VCF file
        """

        # Create a temporary table for the VCF (same schema as variants,
        # but empty thanks to the WHERE 0 clause)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table (header lines are comments)
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator (sqlite has no concat())
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
) 2723 """ 2724 self.conn.execute(sql_query_update) 2725 2726 # Drop temporary table 2727 sql_drop = f"DROP TABLE {table_vcf}" 2728 self.conn.execute(sql_drop) 2729 2730 def drop_variants_table(self) -> None: 2731 """ 2732 > This function drops the variants table 2733 """ 2734 2735 table_variants = self.get_table_variants() 2736 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2737 self.conn.execute(sql_table_variants) 2738 2739 def set_variant_id( 2740 self, variant_id_column: str = "variant_id", force: bool = None 2741 ) -> str: 2742 """ 2743 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2744 `#CHROM`, `POS`, `REF`, and `ALT` columns 2745 2746 :param variant_id_column: The name of the column to be created in the variants table, defaults 2747 to variant_id 2748 :type variant_id_column: str (optional) 2749 :param force: If True, the variant_id column will be created even if it already exists 2750 :type force: bool 2751 :return: The name of the column that contains the variant_id 2752 """ 2753 2754 # Assembly 2755 assembly = self.get_param().get( 2756 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2757 ) 2758 2759 # INFO/Tag prefix 2760 prefix = self.get_explode_infos_prefix() 2761 2762 # Explode INFO/SVTYPE 2763 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2764 2765 # variants table 2766 table_variants = self.get_table_variants() 2767 2768 # variant_id column 2769 if not variant_id_column: 2770 variant_id_column = "variant_id" 2771 2772 # Creta variant_id column 2773 if "variant_id" not in self.get_extra_infos() or force: 2774 2775 # Create column 2776 self.add_column( 2777 table_name=table_variants, 2778 column_name=variant_id_column, 2779 column_type="UBIGINT", 2780 default_value="0", 2781 ) 2782 2783 # Update column 2784 self.conn.execute( 2785 f""" 2786 UPDATE {table_variants} 2787 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", 
'"{prefix}SVTYPE"') 2788 """ 2789 ) 2790 2791 # Remove added columns 2792 for added_column in added_columns: 2793 self.drop_column(column=added_column) 2794 2795 # return variant_id column name 2796 return variant_id_column 2797 2798 def get_variant_id_column( 2799 self, variant_id_column: str = "variant_id", force: bool = None 2800 ) -> str: 2801 """ 2802 This function returns the variant_id column name 2803 2804 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2805 defaults to variant_id 2806 :type variant_id_column: str (optional) 2807 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2808 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2809 if it is not already set, or if it is set 2810 :type force: bool 2811 :return: The variant_id column name. 2812 """ 2813 2814 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2815 2816 ### 2817 # Annotation 2818 ### 2819 2820 def scan_databases( 2821 self, 2822 database_formats: list = ["parquet"], 2823 database_releases: list = ["current"], 2824 ) -> dict: 2825 """ 2826 The function `scan_databases` scans for available databases based on specified formats and 2827 releases. 2828 2829 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2830 of the databases to be scanned. In this case, the accepted format is "parquet" 2831 :type database_formats: list ["parquet"] 2832 :param database_releases: The `database_releases` parameter is a list that specifies the 2833 releases of the databases to be scanned. 
In the provided function, the default value for 2834 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2835 databases that are in the "current" 2836 :type database_releases: list 2837 :return: The function `scan_databases` returns a dictionary containing information about 2838 databases that match the specified formats and releases. 2839 """ 2840 2841 # Config 2842 config = self.get_config() 2843 2844 # Param 2845 param = self.get_param() 2846 2847 # Param - Assembly 2848 assembly = param.get("assembly", config.get("assembly", None)) 2849 if not assembly: 2850 assembly = DEFAULT_ASSEMBLY 2851 log.warning(f"Default assembly '{assembly}'") 2852 2853 # Scan for availabled databases 2854 log.info( 2855 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2856 ) 2857 databases_infos_dict = databases_infos( 2858 database_folder_releases=database_releases, 2859 database_formats=database_formats, 2860 assembly=assembly, 2861 config=config, 2862 ) 2863 log.info( 2864 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2865 ) 2866 2867 return databases_infos_dict 2868 2869 def annotation(self) -> None: 2870 """ 2871 It annotates the VCF file with the annotations specified in the config file. 
2872 """ 2873 2874 # Config 2875 config = self.get_config() 2876 2877 # Param 2878 param = self.get_param() 2879 2880 # Param - Assembly 2881 assembly = param.get("assembly", config.get("assembly", None)) 2882 if not assembly: 2883 assembly = DEFAULT_ASSEMBLY 2884 log.warning(f"Default assembly '{assembly}'") 2885 2886 # annotations databases folders 2887 annotations_databases = set( 2888 config.get("folders", {}) 2889 .get("databases", {}) 2890 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2891 + config.get("folders", {}) 2892 .get("databases", {}) 2893 .get("parquet", ["~/howard/databases/parquet/current"]) 2894 + config.get("folders", {}) 2895 .get("databases", {}) 2896 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2897 ) 2898 2899 # Get param annotations 2900 if param.get("annotations", None) and isinstance( 2901 param.get("annotations", None), str 2902 ): 2903 log.debug(param.get("annotations", None)) 2904 param_annotation_list = param.get("annotations").split(",") 2905 else: 2906 param_annotation_list = [] 2907 2908 # Each tools param 2909 if param.get("annotation_parquet", None) != None: 2910 log.debug( 2911 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2912 ) 2913 if isinstance(param.get("annotation_parquet", None), list): 2914 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2915 else: 2916 param_annotation_list.append(param.get("annotation_parquet")) 2917 if param.get("annotation_snpsift", None) != None: 2918 if isinstance(param.get("annotation_snpsift", None), list): 2919 param_annotation_list.append( 2920 "snpsift:" 2921 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2922 ) 2923 else: 2924 param_annotation_list.append( 2925 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2926 ) 2927 if param.get("annotation_snpeff", None) != None: 2928 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2929 if param.get("annotation_bcftools", 
None) != None: 2930 if isinstance(param.get("annotation_bcftools", None), list): 2931 param_annotation_list.append( 2932 "bcftools:" 2933 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2934 ) 2935 else: 2936 param_annotation_list.append( 2937 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2938 ) 2939 if param.get("annotation_annovar", None) != None: 2940 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2941 if param.get("annotation_exomiser", None) != None: 2942 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2943 if param.get("annotation_splice", None) != None: 2944 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2945 2946 # Merge param annotations list 2947 param["annotations"] = ",".join(param_annotation_list) 2948 2949 # debug 2950 log.debug(f"param_annotations={param['annotations']}") 2951 2952 if param.get("annotations"): 2953 2954 # Log 2955 # log.info("Annotations - Check annotation parameters") 2956 2957 if not "annotation" in param: 2958 param["annotation"] = {} 2959 2960 # List of annotations parameters 2961 annotations_list_input = {} 2962 if isinstance(param.get("annotations", None), str): 2963 annotation_file_list = [ 2964 value for value in param.get("annotations", "").split(",") 2965 ] 2966 for annotation_file in annotation_file_list: 2967 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2968 else: 2969 annotations_list_input = param.get("annotations", {}) 2970 2971 log.info(f"Quick Annotations:") 2972 for annotation_key in list(annotations_list_input.keys()): 2973 log.info(f" {annotation_key}") 2974 2975 # List of annotations and associated fields 2976 annotations_list = {} 2977 2978 for annotation_file in annotations_list_input: 2979 2980 # Explode annotations if ALL 2981 if ( 2982 annotation_file.upper() == "ALL" 2983 or annotation_file.upper().startswith("ALL:") 2984 ): 2985 2986 # check ALL parameters (formats, 
releases) 2987 annotation_file_split = annotation_file.split(":") 2988 database_formats = "parquet" 2989 database_releases = "current" 2990 for annotation_file_option in annotation_file_split[1:]: 2991 database_all_options_split = annotation_file_option.split("=") 2992 if database_all_options_split[0] == "format": 2993 database_formats = database_all_options_split[1].split("+") 2994 if database_all_options_split[0] == "release": 2995 database_releases = database_all_options_split[1].split("+") 2996 2997 # Scan for availabled databases 2998 databases_infos_dict = self.scan_databases( 2999 database_formats=database_formats, 3000 database_releases=database_releases, 3001 ) 3002 3003 # Add found databases in annotation parameters 3004 for database_infos in databases_infos_dict.keys(): 3005 annotations_list[database_infos] = {"INFO": None} 3006 3007 else: 3008 annotations_list[annotation_file] = annotations_list_input[ 3009 annotation_file 3010 ] 3011 3012 # Check each databases 3013 if len(annotations_list): 3014 3015 log.info( 3016 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3017 ) 3018 3019 for annotation_file in annotations_list: 3020 3021 # Init 3022 annotations = annotations_list.get(annotation_file, None) 3023 3024 # Annotation snpEff 3025 if annotation_file.startswith("snpeff"): 3026 3027 log.debug(f"Quick Annotation snpEff") 3028 3029 if "snpeff" not in param["annotation"]: 3030 param["annotation"]["snpeff"] = {} 3031 3032 if "options" not in param["annotation"]["snpeff"]: 3033 param["annotation"]["snpeff"]["options"] = "" 3034 3035 # snpEff options in annotations 3036 param["annotation"]["snpeff"]["options"] = "".join( 3037 annotation_file.split(":")[1:] 3038 ) 3039 3040 # Annotation Annovar 3041 elif annotation_file.startswith("annovar"): 3042 3043 log.debug(f"Quick Annotation Annovar") 3044 3045 if "annovar" not in param["annotation"]: 3046 param["annotation"]["annovar"] = {} 3047 3048 if "annotations" not in param["annotation"]["annovar"]: 3049 param["annotation"]["annovar"]["annotations"] = {} 3050 3051 # Options 3052 annotation_file_split = annotation_file.split(":") 3053 for annotation_file_annotation in annotation_file_split[1:]: 3054 if annotation_file_annotation: 3055 param["annotation"]["annovar"]["annotations"][ 3056 annotation_file_annotation 3057 ] = annotations 3058 3059 # Annotation Exomiser 3060 elif annotation_file.startswith("exomiser"): 3061 3062 log.debug(f"Quick Annotation Exomiser") 3063 3064 param["annotation"]["exomiser"] = params_string_to_dict( 3065 annotation_file 3066 ) 3067 3068 # Annotation Splice 3069 elif annotation_file.startswith("splice"): 3070 3071 log.debug(f"Quick Annotation Splice") 3072 3073 param["annotation"]["splice"] = params_string_to_dict( 3074 annotation_file 3075 ) 3076 3077 # Annotation Parquet or BCFTOOLS 3078 else: 3079 3080 # Tools detection 3081 if annotation_file.startswith("bcftools:"): 3082 annotation_tool_initial = "bcftools" 3083 annotation_file = ":".join(annotation_file.split(":")[1:]) 3084 elif annotation_file.startswith("snpsift:"): 3085 annotation_tool_initial = 
"snpsift" 3086 annotation_file = ":".join(annotation_file.split(":")[1:]) 3087 elif annotation_file.startswith("bigwig:"): 3088 annotation_tool_initial = "bigwig" 3089 annotation_file = ":".join(annotation_file.split(":")[1:]) 3090 else: 3091 annotation_tool_initial = None 3092 3093 # list of files 3094 annotation_file_list = annotation_file.replace("+", ":").split( 3095 ":" 3096 ) 3097 3098 for annotation_file in annotation_file_list: 3099 3100 if annotation_file: 3101 3102 # Annotation tool initial 3103 annotation_tool = annotation_tool_initial 3104 3105 # Find file 3106 annotation_file_found = None 3107 3108 if os.path.exists(annotation_file): 3109 annotation_file_found = annotation_file 3110 elif os.path.exists(full_path(annotation_file)): 3111 annotation_file_found = full_path(annotation_file) 3112 else: 3113 # Find within assembly folders 3114 for annotations_database in annotations_databases: 3115 found_files = find_all( 3116 annotation_file, 3117 os.path.join( 3118 annotations_database, assembly 3119 ), 3120 ) 3121 if len(found_files) > 0: 3122 annotation_file_found = found_files[0] 3123 break 3124 if not annotation_file_found and not assembly: 3125 # Find within folders 3126 for ( 3127 annotations_database 3128 ) in annotations_databases: 3129 found_files = find_all( 3130 annotation_file, annotations_database 3131 ) 3132 if len(found_files) > 0: 3133 annotation_file_found = found_files[0] 3134 break 3135 log.debug( 3136 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3137 ) 3138 3139 # Full path 3140 annotation_file_found = full_path(annotation_file_found) 3141 3142 if annotation_file_found: 3143 3144 database = Database(database=annotation_file_found) 3145 quick_annotation_format = database.get_format() 3146 quick_annotation_is_compressed = ( 3147 database.is_compressed() 3148 ) 3149 quick_annotation_is_indexed = os.path.exists( 3150 f"{annotation_file_found}.tbi" 3151 ) 3152 bcftools_preference = False 3153 3154 # Check Annotation 
Tool 3155 if not annotation_tool: 3156 if ( 3157 bcftools_preference 3158 and quick_annotation_format 3159 in ["vcf", "bed"] 3160 and quick_annotation_is_compressed 3161 and quick_annotation_is_indexed 3162 ): 3163 annotation_tool = "bcftools" 3164 elif quick_annotation_format in [ 3165 "vcf", 3166 "bed", 3167 "tsv", 3168 "tsv", 3169 "csv", 3170 "json", 3171 "tbl", 3172 "parquet", 3173 "duckdb", 3174 ]: 3175 annotation_tool = "parquet" 3176 elif quick_annotation_format in [ 3177 "bw" 3178 ]: 3179 annotation_tool = "bigwig" 3180 else: 3181 log.error( 3182 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3183 ) 3184 raise ValueError( 3185 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3186 ) 3187 3188 log.debug( 3189 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3190 ) 3191 3192 # Annotation Tool dispatch 3193 if annotation_tool: 3194 if annotation_tool not in param["annotation"]: 3195 param["annotation"][annotation_tool] = {} 3196 if ( 3197 "annotations" 3198 not in param["annotation"][annotation_tool] 3199 ): 3200 param["annotation"][annotation_tool][ 3201 "annotations" 3202 ] = {} 3203 param["annotation"][annotation_tool][ 3204 "annotations" 3205 ][annotation_file_found] = annotations 3206 3207 else: 3208 log.warning( 3209 f"Quick Annotation File {annotation_file} does NOT exist" 3210 ) 3211 3212 self.set_param(param) 3213 3214 if param.get("annotation", None): 3215 log.info("Annotations") 3216 if param.get("annotation", {}).get("parquet", None): 3217 log.info("Annotations 'parquet'...") 3218 self.annotation_parquet() 3219 if param.get("annotation", {}).get("bcftools", None): 3220 log.info("Annotations 'bcftools'...") 3221 self.annotation_bcftools() 3222 if param.get("annotation", {}).get("snpsift", None): 3223 log.info("Annotations 'snpsift'...") 3224 self.annotation_snpsift() 3225 if param.get("annotation", 
{}).get("bigwig", None): 3226 log.info("Annotations 'bigwig'...") 3227 self.annotation_bigwig() 3228 if param.get("annotation", {}).get("annovar", None): 3229 log.info("Annotations 'annovar'...") 3230 self.annotation_annovar() 3231 if param.get("annotation", {}).get("snpeff", None): 3232 log.info("Annotations 'snpeff'...") 3233 self.annotation_snpeff() 3234 if param.get("annotation", {}).get("exomiser", None) is not None: 3235 log.info("Annotations 'exomiser'...") 3236 self.annotation_exomiser() 3237 if param.get("annotation", {}).get("splice", None) is not None: 3238 log.info("Annotations 'splice' ...") 3239 self.annotation_splice() 3240 3241 # Explode INFOS fields into table fields 3242 if self.get_explode_infos(): 3243 self.explode_infos( 3244 prefix=self.get_explode_infos_prefix(), 3245 fields=self.get_explode_infos_fields(), 3246 force=True, 3247 ) 3248 3249 3250 def annotation_bigwig(self, threads: int = None) -> None: 3251 """ 3252 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3253 3254 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3255 number of threads to be used for parallel processing during the annotation process. 
If the 3256 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3257 threads to use based on the system configuration 3258 :type threads: int 3259 :return: True 3260 """ 3261 3262 # DEBUG 3263 log.debug("Start annotation with bigwig databases") 3264 3265 # # Threads 3266 # if not threads: 3267 # threads = self.get_threads() 3268 # log.debug("Threads: " + str(threads)) 3269 3270 # Config 3271 config = self.get_config() 3272 log.debug("Config: " + str(config)) 3273 3274 # Config - BCFTools databases folders 3275 databases_folders = set( 3276 self.get_config() 3277 .get("folders", {}) 3278 .get("databases", {}) 3279 .get("annotations", ["."]) 3280 + self.get_config() 3281 .get("folders", {}) 3282 .get("databases", {}) 3283 .get("bigwig", ["."]) 3284 ) 3285 log.debug("Databases annotations: " + str(databases_folders)) 3286 3287 # Param 3288 annotations = ( 3289 self.get_param() 3290 .get("annotation", {}) 3291 .get("bigwig", {}) 3292 .get("annotations", None) 3293 ) 3294 log.debug("Annotations: " + str(annotations)) 3295 3296 # Assembly 3297 assembly = self.get_param().get( 3298 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3299 ) 3300 3301 # Data 3302 table_variants = self.get_table_variants() 3303 3304 # Check if not empty 3305 log.debug("Check if not empty") 3306 sql_query_chromosomes = ( 3307 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3308 ) 3309 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3310 if not sql_query_chromosomes_df["count"][0]: 3311 log.info(f"VCF empty") 3312 return 3313 3314 # VCF header 3315 vcf_reader = self.get_header() 3316 log.debug("Initial header: " + str(vcf_reader.infos)) 3317 3318 # Existing annotations 3319 for vcf_annotation in self.get_header().infos: 3320 3321 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3322 log.debug( 3323 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3324 
) 3325 3326 if annotations: 3327 3328 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3329 3330 # Export VCF file 3331 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3332 3333 # annotation_bigwig_config 3334 annotation_bigwig_config_list = [] 3335 3336 for annotation in annotations: 3337 annotation_fields = annotations[annotation] 3338 3339 # Annotation Name 3340 annotation_name = os.path.basename(annotation) 3341 3342 if not annotation_fields: 3343 annotation_fields = {"INFO": None} 3344 3345 log.debug(f"Annotation '{annotation_name}'") 3346 log.debug( 3347 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3348 ) 3349 3350 # Create Database 3351 database = Database( 3352 database=annotation, 3353 databases_folders=databases_folders, 3354 assembly=assembly, 3355 ) 3356 3357 # Find files 3358 db_file = database.get_database() 3359 db_file = full_path(db_file) 3360 db_hdr_file = database.get_header_file() 3361 db_hdr_file = full_path(db_hdr_file) 3362 db_file_type = database.get_format() 3363 3364 # If db_file is http ? 
3365 if database.get_database().startswith("http"): 3366 3367 # Datbase is HTTP URL 3368 db_file_is_http = True 3369 3370 # DB file keep as URL 3371 db_file = database.get_database() 3372 log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)") 3373 3374 # Retrieve automatic annotation field name 3375 annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", "")) 3376 log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL") 3377 3378 # Create automatic header file 3379 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3380 with open(db_hdr_file, 'w') as f: 3381 f.write("##fileformat=VCFv4.2\n") 3382 f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""") 3383 f.write(f"#CHROM START END {annotation_field}\n") 3384 3385 else: 3386 3387 # Datbase is NOT HTTP URL 3388 db_file_is_http = False 3389 3390 3391 # Check index - try to create if not exists 3392 if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]: 3393 #if False: 3394 log.error("Annotation failed: database not valid") 3395 log.error(f"Annotation annotation file: {db_file}") 3396 log.error(f"Annotation annotation file type: {db_file_type}") 3397 log.error(f"Annotation annotation header: {db_hdr_file}") 3398 raise ValueError( 3399 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3400 ) 3401 else: 3402 3403 # Log 3404 log.debug( 3405 f"Annotation '{annotation}' - file: " 3406 + str(db_file) 3407 + " and " 3408 + str(db_hdr_file) 3409 ) 3410 3411 # Load header as VCF object 3412 db_hdr_vcf = Variants(input=db_hdr_file) 3413 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3414 log.debug( 3415 "Annotation database header: " 3416 + 
str(db_hdr_vcf_header_infos) 3417 ) 3418 3419 # For all fields in database 3420 annotation_fields_full = False 3421 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3422 annotation_fields = { 3423 key: key for key in db_hdr_vcf_header_infos 3424 } 3425 log.debug( 3426 "Annotation database header - All annotations added: " 3427 + str(annotation_fields) 3428 ) 3429 annotation_fields_full = True 3430 3431 # Init 3432 cyvcf2_header_rename_dict = {} 3433 cyvcf2_header_list = [] 3434 cyvcf2_header_indexes = {} 3435 3436 # process annotation fields 3437 for annotation_field in annotation_fields: 3438 3439 # New annotation name 3440 annotation_field_new = annotation_fields[annotation_field] 3441 3442 # Check annotation field and index in header 3443 if annotation_field in db_hdr_vcf.get_header_columns_as_list(): 3444 annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3 3445 cyvcf2_header_indexes[annotation_field_new] = annotation_field_index 3446 else: 3447 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3448 log.error(msg_err) 3449 raise ValueError(msg_err) 3450 3451 # Append annotation field in cyvcf2 header list 3452 cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id 3453 cyvcf2_header_list.append( 3454 { 3455 "ID": annotation_field_new, 3456 "Number": db_hdr_vcf_header_infos[annotation_field].num, 3457 "Type": db_hdr_vcf_header_infos[annotation_field].type, 3458 "Description": db_hdr_vcf_header_infos[annotation_field].desc, 3459 } 3460 ) 3461 3462 # Load bigwig database 3463 bw_db = pyBigWig.open(db_file) 3464 if bw_db.isBigWig(): 3465 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3466 else: 3467 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3468 log.error(msg_err) 3469 raise ValueError(msg_err) 3470 3471 annotation_bigwig_config_list.append( 3472 { 3473 "db_file": db_file, 3474 "bw_db": bw_db, 3475 
"cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3476 "cyvcf2_header_list": cyvcf2_header_list, 3477 "cyvcf2_header_indexes": cyvcf2_header_indexes 3478 } 3479 ) 3480 3481 # Annotate 3482 if annotation_bigwig_config_list: 3483 3484 # Annotation config 3485 log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}") 3486 3487 # Export VCF file 3488 self.export_variant_vcf( 3489 vcf_file=tmp_vcf_name, 3490 remove_info=True, 3491 add_samples=False, 3492 index=True, 3493 ) 3494 3495 # Load input tmp file 3496 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3497 3498 # Add header in input file 3499 for annotation_bigwig_config in annotation_bigwig_config_list: 3500 for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]): 3501 log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'") 3502 input_vcf.add_info_to_header( 3503 cyvcf2_header_field 3504 ) 3505 3506 # Create output VCF file 3507 output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz") 3508 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3509 3510 # Fetch variants 3511 log.info(f"Annotations 'bigwig' start...") 3512 for variant in input_vcf: 3513 3514 for annotation_bigwig_config in annotation_bigwig_config_list: 3515 3516 # DB and indexes 3517 bw_db = annotation_bigwig_config.get("bw_db", None) 3518 cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None) 3519 3520 # Retrieve value from chrom pos 3521 res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS) 3522 3523 # For each annotation fields (and indexes) 3524 for cyvcf2_header_index in cyvcf2_header_indexes: 3525 3526 # If value is NOT nNone 3527 if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]): 3528 variant.INFO[cyvcf2_header_index] = 
res[cyvcf2_header_indexes[cyvcf2_header_index]] 3529 3530 # Add record in output file 3531 output_vcf.write_record(variant) 3532 3533 # Log 3534 log.debug(f"Annotation done.") 3535 3536 # Close and write file 3537 log.info(f"Annotations 'bigwig' write...") 3538 output_vcf.close() 3539 log.debug(f"Write done.") 3540 3541 # Update variants 3542 log.info(f"Annotations 'bigwig' update...") 3543 self.update_from_vcf(output_vcf_file) 3544 log.debug(f"Update done.") 3545 3546 return True 3547 3548 3549 def annotation_snpsift(self, threads: int = None) -> None: 3550 """ 3551 This function annotate with bcftools 3552 3553 :param threads: Number of threads to use 3554 :return: the value of the variable "return_value". 3555 """ 3556 3557 # DEBUG 3558 log.debug("Start annotation with bcftools databases") 3559 3560 # Threads 3561 if not threads: 3562 threads = self.get_threads() 3563 log.debug("Threads: " + str(threads)) 3564 3565 # Config 3566 config = self.get_config() 3567 log.debug("Config: " + str(config)) 3568 3569 # Config - snpSift 3570 snpsift_bin_command = get_bin_command( 3571 bin="SnpSift.jar", 3572 tool="snpsift", 3573 bin_type="jar", 3574 config=config, 3575 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3576 ) 3577 if not snpsift_bin_command: 3578 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3579 log.error(msg_err) 3580 raise ValueError(msg_err) 3581 3582 # Config - bcftools 3583 bcftools_bin_command = get_bin_command( 3584 bin="bcftools", 3585 tool="bcftools", 3586 bin_type="bin", 3587 config=config, 3588 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3589 ) 3590 if not bcftools_bin_command: 3591 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3592 log.error(msg_err) 3593 raise ValueError(msg_err) 3594 3595 # Config - BCFTools databases folders 3596 databases_folders = set( 3597 self.get_config() 3598 .get("folders", {}) 3599 .get("databases", {}) 3600 .get("annotations", ["."]) 3601 + self.get_config() 
3602 .get("folders", {}) 3603 .get("databases", {}) 3604 .get("bcftools", ["."]) 3605 ) 3606 log.debug("Databases annotations: " + str(databases_folders)) 3607 3608 # Param 3609 annotations = ( 3610 self.get_param() 3611 .get("annotation", {}) 3612 .get("snpsift", {}) 3613 .get("annotations", None) 3614 ) 3615 log.debug("Annotations: " + str(annotations)) 3616 3617 # Assembly 3618 assembly = self.get_param().get( 3619 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3620 ) 3621 3622 # Data 3623 table_variants = self.get_table_variants() 3624 3625 # Check if not empty 3626 log.debug("Check if not empty") 3627 sql_query_chromosomes = ( 3628 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3629 ) 3630 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3631 if not sql_query_chromosomes_df["count"][0]: 3632 log.info(f"VCF empty") 3633 return 3634 3635 # VCF header 3636 vcf_reader = self.get_header() 3637 log.debug("Initial header: " + str(vcf_reader.infos)) 3638 3639 # Existing annotations 3640 for vcf_annotation in self.get_header().infos: 3641 3642 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3643 log.debug( 3644 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3645 ) 3646 3647 if annotations: 3648 3649 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3650 3651 # Export VCF file 3652 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3653 3654 # Init 3655 commands = {} 3656 3657 for annotation in annotations: 3658 annotation_fields = annotations[annotation] 3659 3660 # Annotation Name 3661 annotation_name = os.path.basename(annotation) 3662 3663 if not annotation_fields: 3664 annotation_fields = {"INFO": None} 3665 3666 log.debug(f"Annotation '{annotation_name}'") 3667 log.debug( 3668 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3669 ) 3670 3671 # Create Database 3672 database = Database( 3673 database=annotation, 3674 
databases_folders=databases_folders, 3675 assembly=assembly, 3676 ) 3677 3678 # Find files 3679 db_file = database.get_database() 3680 db_file = full_path(db_file) 3681 db_hdr_file = database.get_header_file() 3682 db_hdr_file = full_path(db_hdr_file) 3683 db_file_type = database.get_format() 3684 db_tbi_file = f"{db_file}.tbi" 3685 db_file_compressed = database.is_compressed() 3686 3687 # Check if compressed 3688 if not db_file_compressed: 3689 log.error( 3690 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3691 ) 3692 raise ValueError( 3693 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3694 ) 3695 3696 # Check if indexed 3697 if not os.path.exists(db_tbi_file): 3698 log.error( 3699 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3700 ) 3701 raise ValueError( 3702 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3703 ) 3704 3705 # Check index - try to create if not exists 3706 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3707 log.error("Annotation failed: database not valid") 3708 log.error(f"Annotation annotation file: {db_file}") 3709 log.error(f"Annotation annotation header: {db_hdr_file}") 3710 log.error(f"Annotation annotation index: {db_tbi_file}") 3711 raise ValueError( 3712 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3713 ) 3714 else: 3715 3716 log.debug( 3717 f"Annotation '{annotation}' - file: " 3718 + str(db_file) 3719 + " and " 3720 + str(db_hdr_file) 3721 ) 3722 3723 # Load header as VCF object 3724 db_hdr_vcf = Variants(input=db_hdr_file) 3725 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3726 log.debug( 3727 "Annotation database header: " 3728 + str(db_hdr_vcf_header_infos) 3729 ) 3730 3731 # For all fields in database 3732 annotation_fields_full = False 3733 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3734 
annotation_fields = { 3735 key: key for key in db_hdr_vcf_header_infos 3736 } 3737 log.debug( 3738 "Annotation database header - All annotations added: " 3739 + str(annotation_fields) 3740 ) 3741 annotation_fields_full = True 3742 3743 # # Create file for field rename 3744 # log.debug("Create file for field rename") 3745 # tmp_rename = NamedTemporaryFile( 3746 # prefix=self.get_prefix(), 3747 # dir=self.get_tmp_dir(), 3748 # suffix=".rename", 3749 # delete=False, 3750 # ) 3751 # tmp_rename_name = tmp_rename.name 3752 # tmp_files.append(tmp_rename_name) 3753 3754 # Number of fields 3755 nb_annotation_field = 0 3756 annotation_list = [] 3757 annotation_infos_rename_list = [] 3758 3759 for annotation_field in annotation_fields: 3760 3761 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3762 annotation_fields_new_name = annotation_fields.get( 3763 annotation_field, annotation_field 3764 ) 3765 if not annotation_fields_new_name: 3766 annotation_fields_new_name = annotation_field 3767 3768 # Check if field is in DB and if field is not elready in input data 3769 if ( 3770 annotation_field in db_hdr_vcf.get_header().infos 3771 and annotation_fields_new_name 3772 not in self.get_header().infos 3773 ): 3774 3775 log.info( 3776 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3777 ) 3778 3779 # BCFTools annotate param to rename fields 3780 if annotation_field != annotation_fields_new_name: 3781 annotation_infos_rename_list.append( 3782 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3783 ) 3784 3785 # Add INFO field to header 3786 db_hdr_vcf_header_infos_number = ( 3787 db_hdr_vcf_header_infos[annotation_field].num or "." 
3788 ) 3789 db_hdr_vcf_header_infos_type = ( 3790 db_hdr_vcf_header_infos[annotation_field].type 3791 or "String" 3792 ) 3793 db_hdr_vcf_header_infos_description = ( 3794 db_hdr_vcf_header_infos[annotation_field].desc 3795 or f"{annotation_field} description" 3796 ) 3797 db_hdr_vcf_header_infos_source = ( 3798 db_hdr_vcf_header_infos[annotation_field].source 3799 or "unknown" 3800 ) 3801 db_hdr_vcf_header_infos_version = ( 3802 db_hdr_vcf_header_infos[annotation_field].version 3803 or "unknown" 3804 ) 3805 3806 vcf_reader.infos[annotation_fields_new_name] = ( 3807 vcf.parser._Info( 3808 annotation_fields_new_name, 3809 db_hdr_vcf_header_infos_number, 3810 db_hdr_vcf_header_infos_type, 3811 db_hdr_vcf_header_infos_description, 3812 db_hdr_vcf_header_infos_source, 3813 db_hdr_vcf_header_infos_version, 3814 self.code_type_map[ 3815 db_hdr_vcf_header_infos_type 3816 ], 3817 ) 3818 ) 3819 3820 annotation_list.append(annotation_field) 3821 3822 nb_annotation_field += 1 3823 3824 else: 3825 3826 if ( 3827 annotation_field 3828 not in db_hdr_vcf.get_header().infos 3829 ): 3830 log.warning( 3831 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3832 ) 3833 if ( 3834 annotation_fields_new_name 3835 in self.get_header().infos 3836 ): 3837 log.warning( 3838 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3839 ) 3840 3841 log.info( 3842 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3843 ) 3844 3845 annotation_infos = ",".join(annotation_list) 3846 3847 if annotation_infos != "": 3848 3849 # Annotated VCF (and error file) 3850 tmp_annotation_vcf_name = os.path.join( 3851 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3852 ) 3853 tmp_annotation_vcf_name_err = ( 3854 tmp_annotation_vcf_name + ".err" 3855 ) 3856 3857 # Add fields to annotate 3858 if not annotation_fields_full: 3859 annotation_infos_option = f"-info {annotation_infos}" 3860 else: 
                            annotation_infos_option = ""

                        # Info fields rename option for bcftools annotate (-c list)
                        if annotation_infos_rename_list:
                            annotation_infos_rename = " -c " + ",".join(
                                annotation_infos_rename_list
                            )
                        else:
                            annotation_infos_rename = ""

                        # Annotate command: snpSift annotate piped into bcftools
                        # annotate (rename fields / recompress); stderr appended
                        # to the per-command .err file
                        command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                        # Add command (maps command -> its output VCF)
                        commands[command_annotate] = tmp_annotation_vcf_name

            if commands:

                # Export current variants as the VCF to annotate (INFO stripped)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )
                # NOTE(review): unconditional copy to /tmp looks like a debug
                # leftover — confirm whether it should be removed or gated on
                # verbosity
                shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")

                # Num command
                nb_command = 0

                # Run each annotation command, updating variants after each one
                for command_annotate in commands:
                    nb_command += 1
                    log.info(
                        f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
                    )
                    log.debug(f"command_annotate={command_annotate}")
                    run_parallel_commands([command_annotate], threads)

                    # Debug
                    # NOTE(review): debug copy to /tmp — confirm before release
                    shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")

                    # Update variants
                    log.info(
                        f"Annotation - Updating [{nb_command}/{len(commands)}]..."
                    )
                    self.update_from_vcf(commands[command_annotate])

    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with bcftools, using the VCF/BED databases
        declared in param["annotation"]["bcftools"]["annotations"].

        For each database: the database/header/index files are checked, the
        selected INFO fields are added to the in-memory VCF header, and one
        `bcftools annotate` command is built per chromosome (restricted via a
        region BED built around the variants). Commands run in parallel, the
        per-chromosome results are merged with `bcftools merge`, error files
        are scanned for "[E::" / "[W::" messages, and the variants table is
        updated from the merged VCF.

        :param threads: Number of threads to use (defaults to self.get_threads())
        :raises ValueError: if the bcftools binary is not found, a database is
            not compressed/indexed/valid, or an annotation command reported
            "[E::" errors
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG - keep tmp files/folders when verbosity is "debug"
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # (generic "annotations" folders plus bcftools-specific ones, deduplicated)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - databases to annotate with (dict: database -> fields mapping)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param takes precedence over config)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temp file kept on disk (delete=False) so external
        # bcftools processes can read it
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields -> take all INFO fields from the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves file/header/format for the assembly)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate needs bgzipped input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (.tbi required for --regions-file access)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                # (despite the historical comment, nothing is auto-created here)
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database ("ALL"/"INFO" wildcard)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting any missing
                            # metadata from the database header
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # Renamed fields use the bcftools "NEW:=INFO/OLD" syntax
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome, restricted to
                        # merged windows around the variants
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/- 1Mb window around each
                            # variant, clamped at 0, then merged
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files (output VCF + stderr capture per command)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command - annotate restricted to the BED regions,
                            # then tabix-index the result (-Oz1: bgzip level 1,
                            # fast, output is temporary)
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (the file all commands annotate against)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands (split the thread
                # budget across parallel commands; round() may yield 0, fixed
                # just below)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    # NOTE(review): delete=True means this path is removed when
                    # the handle is closed/GC'd while bcftools writes to the
                    # same path externally — confirm intended lifetime
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command (chained after a successful merge)
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages - scan every stderr capture for htslib
                    # style "[W::" warnings and "[E::" errors
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed - any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

    def annotation_exomiser(self, threads: int = None) -> None:
        """
        This function annotate with Exomiser

        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
        - "analysis" (dict/file):
            Full analysis dictionnary parameters (see Exomiser docs).
            Either a dict, or a file in JSON or YAML format.
            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
            Default : None
        - "preset" (string):
            Analysis preset (available in config folder).
4403 Used if no full "analysis" is provided. 4404 Default: "exome" 4405 - "phenopacket" (dict/file): 4406 Samples and phenotipic features parameters (see Exomiser docs). 4407 Either a dict, or a file in JSON or YAML format. 4408 Default: None 4409 - "subject" (dict): 4410 Sample parameters (see Exomiser docs). 4411 Example: 4412 "subject": 4413 { 4414 "id": "ISDBM322017", 4415 "sex": "FEMALE" 4416 } 4417 Default: None 4418 - "sample" (string): 4419 Sample name to construct "subject" section: 4420 "subject": 4421 { 4422 "id": "<sample>", 4423 "sex": "UNKNOWN_SEX" 4424 } 4425 Default: None 4426 - "phenotypicFeatures" (dict) 4427 Phenotypic features to construct "subject" section. 4428 Example: 4429 "phenotypicFeatures": 4430 [ 4431 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4432 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4433 ] 4434 - "hpo" (list) 4435 List of HPO ids as phenotypic features. 4436 Example: 4437 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4438 Default: [] 4439 - "outputOptions" (dict): 4440 Output options (see Exomiser docs). 4441 Default: 4442 "output_options" = 4443 { 4444 "outputContributingVariantsOnly": False, 4445 "numGenes": 0, 4446 "outputFormats": ["TSV_VARIANT", "VCF"] 4447 } 4448 - "transcript_source" (string): 4449 Transcript source (either "refseq", "ucsc", "ensembl") 4450 Default: "refseq" 4451 - "exomiser_to_info" (boolean): 4452 Add exomiser TSV file columns as INFO fields in VCF. 4453 Default: False 4454 - "release" (string): 4455 Exomise database release. 4456 If not exists, database release will be downloaded (take a while). 4457 Default: None (provided by application.properties configuration file) 4458 - "exomiser_application_properties" (file): 4459 Exomiser configuration file (see Exomiser docs). 4460 Useful to automatically download databases (especially for specific genome databases). 
4461 4462 Notes: 4463 - If no sample in parameters, first sample in VCF will be chosen 4464 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4465 4466 :param threads: The number of threads to use 4467 :return: None. 4468 """ 4469 4470 # DEBUG 4471 log.debug("Start annotation with Exomiser databases") 4472 4473 # Threads 4474 if not threads: 4475 threads = self.get_threads() 4476 log.debug("Threads: " + str(threads)) 4477 4478 # Config 4479 config = self.get_config() 4480 log.debug("Config: " + str(config)) 4481 4482 # Config - Folders - Databases 4483 databases_folders = ( 4484 config.get("folders", {}) 4485 .get("databases", {}) 4486 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4487 ) 4488 databases_folders = full_path(databases_folders) 4489 if not os.path.exists(databases_folders): 4490 log.error(f"Databases annotations: {databases_folders} NOT found") 4491 log.debug("Databases annotations: " + str(databases_folders)) 4492 4493 # Config - Exomiser 4494 exomiser_bin_command = get_bin_command( 4495 bin="exomiser-cli*.jar", 4496 tool="exomiser", 4497 bin_type="jar", 4498 config=config, 4499 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4500 ) 4501 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4502 if not exomiser_bin_command: 4503 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4504 log.error(msg_err) 4505 raise ValueError(msg_err) 4506 4507 # Param 4508 param = self.get_param() 4509 log.debug("Param: " + str(param)) 4510 4511 # Param - Exomiser 4512 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4513 log.debug(f"Param Exomiser: {param_exomiser}") 4514 4515 # Param - Assembly 4516 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4517 log.debug("Assembly: " + str(assembly)) 4518 4519 # Data 4520 table_variants = self.get_table_variants() 4521 4522 # Check if not empty 4523 log.debug("Check if not empty") 4524 sql_query_chromosomes = 
( 4525 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4526 ) 4527 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4528 log.info(f"VCF empty") 4529 return False 4530 4531 # VCF header 4532 vcf_reader = self.get_header() 4533 log.debug("Initial header: " + str(vcf_reader.infos)) 4534 4535 # Samples 4536 samples = self.get_header_sample_list() 4537 if not samples: 4538 log.error("No Samples in VCF") 4539 return False 4540 log.debug(f"Samples: {samples}") 4541 4542 # Memory limit 4543 memory_limit = self.get_memory("8G") 4544 log.debug(f"memory_limit: {memory_limit}") 4545 4546 # Exomiser java options 4547 exomiser_java_options = ( 4548 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4549 ) 4550 log.debug(f"Exomiser java options: {exomiser_java_options}") 4551 4552 # Download Exomiser (if not exists) 4553 exomiser_release = param_exomiser.get("release", None) 4554 exomiser_application_properties = param_exomiser.get( 4555 "exomiser_application_properties", None 4556 ) 4557 databases_download_exomiser( 4558 assemblies=[assembly], 4559 exomiser_folder=databases_folders, 4560 exomiser_release=exomiser_release, 4561 exomiser_phenotype_release=exomiser_release, 4562 exomiser_application_properties=exomiser_application_properties, 4563 ) 4564 4565 # Force annotation 4566 force_update_annotation = True 4567 4568 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4569 log.debug("Start annotation Exomiser") 4570 4571 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4572 4573 # tmp_dir = "/tmp/exomiser" 4574 4575 ### ANALYSIS ### 4576 ################ 4577 4578 # Create analysis.json through analysis dict 4579 # either analysis in param or by default 4580 # depending on preset exome/genome) 4581 4582 # Init analysis dict 4583 param_exomiser_analysis_dict = {} 4584 4585 # analysis from param 4586 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4587 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4588 4589 # If analysis in param -> load anlaysis json 4590 if param_exomiser_analysis: 4591 4592 # If param analysis is a file and exists 4593 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4594 param_exomiser_analysis 4595 ): 4596 # Load analysis file into analysis dict (either yaml or json) 4597 with open(param_exomiser_analysis) as json_file: 4598 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4599 4600 # If param analysis is a dict 4601 elif isinstance(param_exomiser_analysis, dict): 4602 # Load analysis dict into analysis dict (either yaml or json) 4603 param_exomiser_analysis_dict = param_exomiser_analysis 4604 4605 # Error analysis type 4606 else: 4607 log.error(f"Analysis type unknown. Check param file.") 4608 raise ValueError(f"Analysis type unknown. Check param file.") 4609 4610 # Case no input analysis config file/dict 4611 # Use preset (exome/genome) to open default config file 4612 if not param_exomiser_analysis_dict: 4613 4614 # default preset 4615 default_preset = "exome" 4616 4617 # Get param preset or default preset 4618 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4619 4620 # Try to find if preset is a file 4621 if os.path.exists(param_exomiser_preset): 4622 # Preset file is provided in full path 4623 param_exomiser_analysis_default_config_file = ( 4624 param_exomiser_preset 4625 ) 4626 # elif os.path.exists(full_path(param_exomiser_preset)): 4627 # # Preset file is provided in full path 4628 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4629 elif os.path.exists( 4630 os.path.join(folder_config, param_exomiser_preset) 4631 ): 4632 # Preset file is provided a basename in config folder (can be a path with subfolders) 4633 param_exomiser_analysis_default_config_file = os.path.join( 4634 folder_config, param_exomiser_preset 4635 ) 4636 else: 4637 # Construct preset file 4638 
param_exomiser_analysis_default_config_file = os.path.join( 4639 folder_config, 4640 f"preset-{param_exomiser_preset}-analysis.json", 4641 ) 4642 4643 # If preset file exists 4644 param_exomiser_analysis_default_config_file = full_path( 4645 param_exomiser_analysis_default_config_file 4646 ) 4647 if os.path.exists(param_exomiser_analysis_default_config_file): 4648 # Load prest file into analysis dict (either yaml or json) 4649 with open( 4650 param_exomiser_analysis_default_config_file 4651 ) as json_file: 4652 # param_exomiser_analysis_dict[""] = json.load(json_file) 4653 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4654 json_file 4655 ) 4656 4657 # Error preset file 4658 else: 4659 log.error( 4660 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4661 ) 4662 raise ValueError( 4663 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4664 ) 4665 4666 # If no analysis dict created 4667 if not param_exomiser_analysis_dict: 4668 log.error(f"No analysis config") 4669 raise ValueError(f"No analysis config") 4670 4671 # Log 4672 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4673 4674 ### PHENOPACKET ### 4675 ################### 4676 4677 # If no PhenoPacket in analysis dict -> check in param 4678 if "phenopacket" not in param_exomiser_analysis_dict: 4679 4680 # If PhenoPacket in param -> load anlaysis json 4681 if param_exomiser.get("phenopacket", None): 4682 4683 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4684 param_exomiser_phenopacket = full_path( 4685 param_exomiser_phenopacket 4686 ) 4687 4688 # If param phenopacket is a file and exists 4689 if isinstance( 4690 param_exomiser_phenopacket, str 4691 ) and os.path.exists(param_exomiser_phenopacket): 4692 # Load phenopacket file into analysis dict (either yaml or json) 4693 with open(param_exomiser_phenopacket) as json_file: 4694 param_exomiser_analysis_dict["phenopacket"] = ( 4695 yaml.safe_load(json_file) 
4696 ) 4697 4698 # If param phenopacket is a dict 4699 elif isinstance(param_exomiser_phenopacket, dict): 4700 # Load phenopacket dict into analysis dict (either yaml or json) 4701 param_exomiser_analysis_dict["phenopacket"] = ( 4702 param_exomiser_phenopacket 4703 ) 4704 4705 # Error phenopacket type 4706 else: 4707 log.error(f"Phenopacket type unknown. Check param file.") 4708 raise ValueError( 4709 f"Phenopacket type unknown. Check param file." 4710 ) 4711 4712 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4713 if "phenopacket" not in param_exomiser_analysis_dict: 4714 4715 # Init PhenoPacket 4716 param_exomiser_analysis_dict["phenopacket"] = { 4717 "id": "analysis", 4718 "proband": {}, 4719 } 4720 4721 ### Add subject ### 4722 4723 # If subject exists 4724 param_exomiser_subject = param_exomiser.get("subject", {}) 4725 4726 # If subject not exists -> found sample ID 4727 if not param_exomiser_subject: 4728 4729 # Found sample ID in param 4730 sample = param_exomiser.get("sample", None) 4731 4732 # Find sample ID (first sample) 4733 if not sample: 4734 sample_list = self.get_header_sample_list() 4735 if len(sample_list) > 0: 4736 sample = sample_list[0] 4737 else: 4738 log.error(f"No sample found") 4739 raise ValueError(f"No sample found") 4740 4741 # Create subject 4742 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4743 4744 # Add to dict 4745 param_exomiser_analysis_dict["phenopacket"][ 4746 "subject" 4747 ] = param_exomiser_subject 4748 4749 ### Add "phenotypicFeatures" ### 4750 4751 # If phenotypicFeatures exists 4752 param_exomiser_phenotypicfeatures = param_exomiser.get( 4753 "phenotypicFeatures", [] 4754 ) 4755 4756 # If phenotypicFeatures not exists -> Try to infer from hpo list 4757 if not param_exomiser_phenotypicfeatures: 4758 4759 # Found HPO in param 4760 param_exomiser_hpo = param_exomiser.get("hpo", []) 4761 4762 # Split HPO if list in string format separated by comma 4763 if 
isinstance(param_exomiser_hpo, str): 4764 param_exomiser_hpo = param_exomiser_hpo.split(",") 4765 4766 # Create HPO list 4767 for hpo in param_exomiser_hpo: 4768 hpo_clean = re.sub("[^0-9]", "", hpo) 4769 param_exomiser_phenotypicfeatures.append( 4770 { 4771 "type": { 4772 "id": f"HP:{hpo_clean}", 4773 "label": f"HP:{hpo_clean}", 4774 } 4775 } 4776 ) 4777 4778 # Add to dict 4779 param_exomiser_analysis_dict["phenopacket"][ 4780 "phenotypicFeatures" 4781 ] = param_exomiser_phenotypicfeatures 4782 4783 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4784 if not param_exomiser_phenotypicfeatures: 4785 for step in param_exomiser_analysis_dict.get( 4786 "analysis", {} 4787 ).get("steps", []): 4788 if "hiPhivePrioritiser" in step: 4789 param_exomiser_analysis_dict.get("analysis", {}).get( 4790 "steps", [] 4791 ).remove(step) 4792 4793 ### Add Input File ### 4794 4795 # Initial file name and htsFiles 4796 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4797 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4798 { 4799 "uri": tmp_vcf_name, 4800 "htsFormat": "VCF", 4801 "genomeAssembly": assembly, 4802 } 4803 ] 4804 4805 ### Add metaData ### 4806 4807 # If metaData not in analysis dict 4808 if "metaData" not in param_exomiser_analysis_dict: 4809 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4810 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4811 "createdBy": "howard", 4812 "phenopacketSchemaVersion": 1, 4813 } 4814 4815 ### OutputOptions ### 4816 4817 # Init output result folder 4818 output_results = os.path.join(tmp_dir, "results") 4819 4820 # If no outputOptions in analysis dict 4821 if "outputOptions" not in param_exomiser_analysis_dict: 4822 4823 # default output formats 4824 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4825 4826 # Get outputOptions in param 4827 output_options = param_exomiser.get("outputOptions", None) 4828 4829 # If no output_options in param -> check 4830 if not output_options: 
4831 output_options = { 4832 "outputContributingVariantsOnly": False, 4833 "numGenes": 0, 4834 "outputFormats": defaut_output_formats, 4835 } 4836 4837 # Replace outputDirectory in output options 4838 output_options["outputDirectory"] = output_results 4839 output_options["outputFileName"] = "howard" 4840 4841 # Add outputOptions in analysis dict 4842 param_exomiser_analysis_dict["outputOptions"] = output_options 4843 4844 else: 4845 4846 # Replace output_results and output format (if exists in param) 4847 param_exomiser_analysis_dict["outputOptions"][ 4848 "outputDirectory" 4849 ] = output_results 4850 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4851 list( 4852 set( 4853 param_exomiser_analysis_dict.get( 4854 "outputOptions", {} 4855 ).get("outputFormats", []) 4856 + ["TSV_VARIANT", "VCF"] 4857 ) 4858 ) 4859 ) 4860 4861 # log 4862 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4863 4864 ### ANALYSIS FILE ### 4865 ##################### 4866 4867 ### Full JSON analysis config file ### 4868 4869 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4870 with open(exomiser_analysis, "w") as fp: 4871 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4872 4873 ### SPLIT analysis and sample config files 4874 4875 # Splitted analysis dict 4876 param_exomiser_analysis_dict_for_split = ( 4877 param_exomiser_analysis_dict.copy() 4878 ) 4879 4880 # Phenopacket JSON file 4881 exomiser_analysis_phenopacket = os.path.join( 4882 tmp_dir, "analysis_phenopacket.json" 4883 ) 4884 with open(exomiser_analysis_phenopacket, "w") as fp: 4885 json.dump( 4886 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4887 fp, 4888 indent=4, 4889 ) 4890 4891 # Analysis JSON file without Phenopacket parameters 4892 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4893 exomiser_analysis_analysis = os.path.join( 4894 tmp_dir, "analysis_analysis.json" 4895 ) 4896 with open(exomiser_analysis_analysis, "w") as fp: 4897 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4898 4899 ### INITAL VCF file ### 4900 ####################### 4901 4902 ### Create list of samples to use and include inti initial VCF file #### 4903 4904 # Subject (main sample) 4905 # Get sample ID in analysis dict 4906 sample_subject = ( 4907 param_exomiser_analysis_dict.get("phenopacket", {}) 4908 .get("subject", {}) 4909 .get("id", None) 4910 ) 4911 sample_proband = ( 4912 param_exomiser_analysis_dict.get("phenopacket", {}) 4913 .get("proband", {}) 4914 .get("subject", {}) 4915 .get("id", None) 4916 ) 4917 sample = [] 4918 if sample_subject: 4919 sample.append(sample_subject) 4920 if sample_proband: 4921 sample.append(sample_proband) 4922 4923 # Get sample ID within Pedigree 4924 pedigree_persons_list = ( 4925 param_exomiser_analysis_dict.get("phenopacket", {}) 4926 .get("pedigree", {}) 4927 .get("persons", {}) 4928 ) 4929 4930 # Create list with all sample ID in pedigree (if exists) 4931 pedigree_persons = [] 4932 for person in pedigree_persons_list: 4933 pedigree_persons.append(person.get("individualId")) 4934 4935 # Concat subject sample ID and samples ID in pedigreesamples 4936 samples = list(set(sample + pedigree_persons)) 4937 4938 # Check if sample list is not empty 4939 if not samples: 4940 log.error(f"No samples found") 4941 raise ValueError(f"No samples found") 4942 4943 # Create VCF with sample (either sample in param or first one by default) 4944 # Export VCF file 4945 self.export_variant_vcf( 4946 vcf_file=tmp_vcf_name, 4947 remove_info=True, 4948 add_samples=True, 4949 list_samples=samples, 4950 index=False, 4951 ) 4952 4953 ### Execute Exomiser ### 4954 ######################## 4955 4956 # Init command 4957 exomiser_command = "" 4958 4959 # Command exomiser options 4960 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4961 4962 # Release 4963 exomiser_release = 
param_exomiser.get("release", None) 4964 if exomiser_release: 4965 # phenotype data version 4966 exomiser_options += ( 4967 f" --exomiser.phenotype.data-version={exomiser_release} " 4968 ) 4969 # data version 4970 exomiser_options += ( 4971 f" --exomiser.{assembly}.data-version={exomiser_release} " 4972 ) 4973 # variant white list 4974 variant_white_list_file = ( 4975 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4976 ) 4977 if os.path.exists( 4978 os.path.join( 4979 databases_folders, assembly, variant_white_list_file 4980 ) 4981 ): 4982 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4983 4984 # transcript_source 4985 transcript_source = param_exomiser.get( 4986 "transcript_source", None 4987 ) # ucsc, refseq, ensembl 4988 if transcript_source: 4989 exomiser_options += ( 4990 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4991 ) 4992 4993 # If analysis contain proband param 4994 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4995 "proband", {} 4996 ): 4997 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4998 4999 # If no proband (usually uniq sample) 5000 else: 5001 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5002 5003 # Log 5004 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5005 5006 # Run command 5007 result = subprocess.call( 5008 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5009 ) 5010 if result: 5011 log.error("Exomiser command failed") 5012 raise ValueError("Exomiser command failed") 5013 5014 ### RESULTS ### 5015 ############### 5016 5017 ### Annotate with TSV fields ### 5018 5019 # Init result tsv file 5020 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5021 5022 # Init result tsv file 5023 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 5024 5025 # Parse TSV file and explode columns in INFO field 5026 if exomiser_to_info and os.path.exists(output_results_tsv): 5027 5028 # Log 5029 log.debug("Exomiser columns to VCF INFO field") 5030 5031 # Retrieve columns and types 5032 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5033 output_results_tsv_df = self.get_query_to_df(query) 5034 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5035 5036 # Init concat fields for update 5037 sql_query_update_concat_fields = [] 5038 5039 # Fields to avoid 5040 fields_to_avoid = [ 5041 "CONTIG", 5042 "START", 5043 "END", 5044 "REF", 5045 "ALT", 5046 "QUAL", 5047 "FILTER", 5048 "GENOTYPE", 5049 ] 5050 5051 # List all columns to add into header 5052 for header_column in output_results_tsv_columns: 5053 5054 # If header column is enable 5055 if header_column not in fields_to_avoid: 5056 5057 # Header info type 5058 header_info_type = "String" 5059 header_column_df = output_results_tsv_df[header_column] 5060 header_column_df_dtype = header_column_df.dtype 5061 if header_column_df_dtype == object: 5062 if ( 5063 pd.to_numeric(header_column_df, errors="coerce") 5064 .notnull() 5065 .all() 5066 ): 5067 header_info_type = "Float" 5068 else: 5069 header_info_type = "Integer" 5070 5071 # Header info 5072 characters_to_validate = ["-"] 5073 pattern = "[" + "".join(characters_to_validate) + "]" 5074 header_info_name = re.sub( 5075 pattern, 5076 "_", 5077 f"Exomiser_{header_column}".replace("#", ""), 5078 ) 5079 header_info_number = "." 
5080 header_info_description = ( 5081 f"Exomiser {header_column} annotation" 5082 ) 5083 header_info_source = "Exomiser" 5084 header_info_version = "unknown" 5085 header_info_code = CODE_TYPE_MAP[header_info_type] 5086 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5087 header_info_name, 5088 header_info_number, 5089 header_info_type, 5090 header_info_description, 5091 header_info_source, 5092 header_info_version, 5093 header_info_code, 5094 ) 5095 5096 # Add field to add for update to concat fields 5097 sql_query_update_concat_fields.append( 5098 f""" 5099 CASE 5100 WHEN table_parquet."{header_column}" NOT IN ('','.') 5101 THEN concat( 5102 '{header_info_name}=', 5103 table_parquet."{header_column}", 5104 ';' 5105 ) 5106 5107 ELSE '' 5108 END 5109 """ 5110 ) 5111 5112 # Update query 5113 sql_query_update = f""" 5114 UPDATE {table_variants} as table_variants 5115 SET INFO = concat( 5116 CASE 5117 WHEN INFO NOT IN ('', '.') 5118 THEN INFO 5119 ELSE '' 5120 END, 5121 CASE 5122 WHEN table_variants.INFO NOT IN ('','.') 5123 THEN ';' 5124 ELSE '' 5125 END, 5126 ( 5127 SELECT 5128 concat( 5129 {",".join(sql_query_update_concat_fields)} 5130 ) 5131 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5132 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5133 AND table_parquet.\"START\" = table_variants.\"POS\" 5134 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5135 AND table_parquet.\"REF\" = table_variants.\"REF\" 5136 ) 5137 ) 5138 ; 5139 """ 5140 5141 # Update 5142 self.conn.execute(sql_query_update) 5143 5144 ### Annotate with VCF INFO field ### 5145 5146 # Init result VCF file 5147 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5148 5149 # If VCF exists 5150 if os.path.exists(output_results_vcf): 5151 5152 # Log 5153 log.debug("Exomiser result VCF update variants") 5154 5155 # Find Exomiser INFO field annotation in header 5156 with 
gzip.open(output_results_vcf, "rt") as f: 5157 header_list = self.read_vcf_header(f) 5158 exomiser_vcf_header = vcf.Reader( 5159 io.StringIO("\n".join(header_list)) 5160 ) 5161 5162 # Add annotation INFO field to header 5163 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5164 5165 # Update variants with VCF 5166 self.update_from_vcf(output_results_vcf) 5167 5168 return True 5169 5170 def annotation_snpeff(self, threads: int = None) -> None: 5171 """ 5172 This function annotate with snpEff 5173 5174 :param threads: The number of threads to use 5175 :return: the value of the variable "return_value". 5176 """ 5177 5178 # DEBUG 5179 log.debug("Start annotation with snpeff databases") 5180 5181 # Threads 5182 if not threads: 5183 threads = self.get_threads() 5184 log.debug("Threads: " + str(threads)) 5185 5186 # DEBUG 5187 delete_tmp = True 5188 if self.get_config().get("verbosity", "warning") in ["debug"]: 5189 delete_tmp = False 5190 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5191 5192 # Config 5193 config = self.get_config() 5194 log.debug("Config: " + str(config)) 5195 5196 # Config - Folders - Databases 5197 databases_folders = ( 5198 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5199 ) 5200 log.debug("Databases annotations: " + str(databases_folders)) 5201 5202 # Config - snpEff bin command 5203 snpeff_bin_command = get_bin_command( 5204 bin="snpEff.jar", 5205 tool="snpeff", 5206 bin_type="jar", 5207 config=config, 5208 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5209 ) 5210 if not snpeff_bin_command: 5211 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5212 log.error(msg_err) 5213 raise ValueError(msg_err) 5214 5215 # Config - snpEff databases 5216 snpeff_databases = ( 5217 config.get("folders", {}) 5218 .get("databases", {}) 5219 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5220 ) 5221 snpeff_databases = full_path(snpeff_databases) 5222 if snpeff_databases is not None and 
snpeff_databases != "": 5223 log.debug(f"Create snpEff databases folder") 5224 if not os.path.exists(snpeff_databases): 5225 os.makedirs(snpeff_databases) 5226 5227 # Param 5228 param = self.get_param() 5229 log.debug("Param: " + str(param)) 5230 5231 # Param 5232 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5233 log.debug("Options: " + str(options)) 5234 5235 # Param - Assembly 5236 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5237 5238 # Param - Options 5239 snpeff_options = ( 5240 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5241 ) 5242 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5243 snpeff_csvstats = ( 5244 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5245 ) 5246 if snpeff_stats: 5247 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5248 snpeff_stats = full_path(snpeff_stats) 5249 snpeff_options += f" -stats {snpeff_stats}" 5250 if snpeff_csvstats: 5251 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5252 snpeff_csvstats = full_path(snpeff_csvstats) 5253 snpeff_options += f" -csvStats {snpeff_csvstats}" 5254 5255 # Data 5256 table_variants = self.get_table_variants() 5257 5258 # Check if not empty 5259 log.debug("Check if not empty") 5260 sql_query_chromosomes = ( 5261 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5262 ) 5263 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5264 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5265 log.info(f"VCF empty") 5266 return 5267 5268 # Export in VCF 5269 log.debug("Create initial file to annotate") 5270 tmp_vcf = NamedTemporaryFile( 5271 prefix=self.get_prefix(), 5272 dir=self.get_tmp_dir(), 5273 suffix=".vcf.gz", 5274 delete=True, 5275 ) 5276 tmp_vcf_name = tmp_vcf.name 5277 5278 # VCF header 5279 vcf_reader = self.get_header() 5280 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5281 5282 # Existing annotations 5283 for vcf_annotation in self.get_header().infos: 5284 5285 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5286 log.debug( 5287 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5288 ) 5289 5290 # Memory limit 5291 # if config.get("memory", None): 5292 # memory_limit = config.get("memory", "8G") 5293 # else: 5294 # memory_limit = "8G" 5295 memory_limit = self.get_memory("8G") 5296 log.debug(f"memory_limit: {memory_limit}") 5297 5298 # snpEff java options 5299 snpeff_java_options = ( 5300 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5301 ) 5302 log.debug(f"Exomiser java options: {snpeff_java_options}") 5303 5304 force_update_annotation = True 5305 5306 if "ANN" not in self.get_header().infos or force_update_annotation: 5307 5308 # Check snpEff database 5309 log.debug(f"Check snpEff databases {[assembly]}") 5310 databases_download_snpeff( 5311 folder=snpeff_databases, assemblies=[assembly], config=config 5312 ) 5313 5314 # Export VCF file 5315 self.export_variant_vcf( 5316 vcf_file=tmp_vcf_name, 5317 remove_info=True, 5318 add_samples=False, 5319 index=True, 5320 ) 5321 5322 # Tmp file 5323 err_files = [] 5324 tmp_annotate_vcf = NamedTemporaryFile( 5325 prefix=self.get_prefix(), 5326 dir=self.get_tmp_dir(), 5327 suffix=".vcf", 5328 delete=False, 5329 ) 5330 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5331 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5332 err_files.append(tmp_annotate_vcf_name_err) 5333 5334 # Command 5335 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5336 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5337 run_parallel_commands([snpeff_command], 1) 5338 5339 # Error messages 5340 log.info(f"Error/Warning messages:") 5341 error_message_command_all = [] 5342 
error_message_command_warning = [] 5343 error_message_command_err = [] 5344 for err_file in err_files: 5345 with open(err_file, "r") as f: 5346 for line in f: 5347 message = line.strip() 5348 error_message_command_all.append(message) 5349 if line.startswith("[W::"): 5350 error_message_command_warning.append(message) 5351 if line.startswith("[E::"): 5352 error_message_command_err.append(f"{err_file}: " + message) 5353 # log info 5354 for message in list( 5355 set(error_message_command_err + error_message_command_warning) 5356 ): 5357 log.info(f" {message}") 5358 # debug info 5359 for message in list(set(error_message_command_all)): 5360 log.debug(f" {message}") 5361 # failed 5362 if len(error_message_command_err): 5363 log.error("Annotation failed: Error in commands") 5364 raise ValueError("Annotation failed: Error in commands") 5365 5366 # Find annotation in header 5367 with open(tmp_annotate_vcf_name, "rt") as f: 5368 header_list = self.read_vcf_header(f) 5369 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5370 5371 for ann in annovar_vcf_header.infos: 5372 if ann not in self.get_header().infos: 5373 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5374 5375 # Update variants 5376 log.info(f"Annotation - Updating...") 5377 self.update_from_vcf(tmp_annotate_vcf_name) 5378 5379 else: 5380 if "ANN" in self.get_header().infos: 5381 log.debug(f"Existing snpEff annotations in VCF") 5382 if force_update_annotation: 5383 log.debug(f"Existing snpEff annotations in VCF - annotation forced") 5384 5385 def annotation_annovar(self, threads: int = None) -> None: 5386 """ 5387 It takes a VCF file, annotates it with Annovar, and then updates the database with the new 5388 annotations 5389 5390 :param threads: number of threads to use 5391 :return: the value of the variable "return_value". 
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files, accumulated for final cleanup / error scanning
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but not used below — the
        # cleanup block at the end always runs; confirm intended.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl resolved via config)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for view/annotate/merge steps)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (first entry wins if a list;
        # created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (dict: database name -> field mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created if missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (early return on empty variants table)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools
            # --rename-annots below)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing database files)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per requested database; results merged at the end
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is re-initialized on each loop
                # iteration, dropping previously collected err paths — confirm
                # intended (the final error scan after the merge only sees the
                # merge err file).
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (appends 'old new' line for bcftools)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: g=gene-based, r=region-based, f=filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ('genebase' handled via --argument above)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warning/error lines
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            # Merge every per-database annotated VCF with the original export
            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO definitions into the in-memory header
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        # NOTE(review): 'if True' makes cleaning unconditional regardless of
        # delete_tmp computed above — confirm intended.
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        It takes a VCF file, and annotates it with a parquet file

        :param threads: number of threads to use for the annotation
        :return: the value of the variable "result".
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update",
False) 5831 ) 5832 log.debug(f"force_update_annotation={force_update_annotation}") 5833 force_append_annotation = ( 5834 self.get_param() 5835 .get("annotation", {}) 5836 .get("options", {}) 5837 .get("annotations_append", False) 5838 ) 5839 log.debug(f"force_append_annotation={force_append_annotation}") 5840 5841 # Data 5842 table_variants = self.get_table_variants() 5843 5844 # Check if not empty 5845 log.debug("Check if not empty") 5846 sql_query_chromosomes_df = self.get_query_to_df( 5847 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5848 ) 5849 if not sql_query_chromosomes_df["count"][0]: 5850 log.info(f"VCF empty") 5851 return 5852 5853 # VCF header 5854 vcf_reader = self.get_header() 5855 log.debug("Initial header: " + str(vcf_reader.infos)) 5856 5857 # Nb Variants POS 5858 log.debug("NB Variants Start") 5859 nb_variants = self.conn.execute( 5860 f"SELECT count(*) AS count FROM variants" 5861 ).fetchdf()["count"][0] 5862 log.debug("NB Variants Stop") 5863 5864 # Existing annotations 5865 for vcf_annotation in self.get_header().infos: 5866 5867 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5868 log.debug( 5869 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5870 ) 5871 5872 # Added columns 5873 added_columns = [] 5874 5875 # drop indexes 5876 log.debug(f"Drop indexes...") 5877 self.drop_indexes() 5878 5879 if annotations: 5880 5881 if "ALL" in annotations: 5882 5883 all_param = annotations.get("ALL", {}) 5884 all_param_formats = all_param.get("formats", None) 5885 all_param_releases = all_param.get("releases", None) 5886 5887 databases_infos_dict = self.scan_databases( 5888 database_formats=all_param_formats, 5889 database_releases=all_param_releases, 5890 ) 5891 for database_infos in databases_infos_dict.keys(): 5892 if database_infos not in annotations: 5893 annotations[database_infos] = {"INFO": None} 5894 5895 for annotation in annotations: 5896 5897 if annotation in ["ALL"]: 
5898 continue 5899 5900 # Annotation Name 5901 annotation_name = os.path.basename(annotation) 5902 5903 # Annotation fields 5904 annotation_fields = annotations[annotation] 5905 if not annotation_fields: 5906 annotation_fields = {"INFO": None} 5907 5908 log.debug(f"Annotation '{annotation_name}'") 5909 log.debug( 5910 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5911 ) 5912 5913 # Create Database 5914 database = Database( 5915 database=annotation, 5916 databases_folders=databases_folders, 5917 assembly=assembly, 5918 ) 5919 5920 # Find files 5921 parquet_file = database.get_database() 5922 parquet_hdr_file = database.get_header_file() 5923 parquet_type = database.get_type() 5924 5925 # Check if files exists 5926 if not parquet_file or not parquet_hdr_file: 5927 msg_err_list = [] 5928 if not parquet_file: 5929 msg_err_list.append( 5930 f"Annotation failed: Annotation file not found" 5931 ) 5932 if parquet_file and not parquet_hdr_file: 5933 msg_err_list.append( 5934 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 5935 ) 5936 5937 log.error(". ".join(msg_err_list)) 5938 raise ValueError(". 
".join(msg_err_list)) 5939 else: 5940 # Get parquet connexion 5941 parquet_sql_attach = database.get_sql_database_attach( 5942 output="query" 5943 ) 5944 if parquet_sql_attach: 5945 self.conn.execute(parquet_sql_attach) 5946 parquet_file_link = database.get_sql_database_link() 5947 # Log 5948 log.debug( 5949 f"Annotation '{annotation_name}' - file: " 5950 + str(parquet_file) 5951 + " and " 5952 + str(parquet_hdr_file) 5953 ) 5954 5955 # Database full header columns 5956 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5957 parquet_hdr_file 5958 ) 5959 # Log 5960 log.debug( 5961 "Annotation database header columns : " 5962 + str(parquet_hdr_vcf_header_columns) 5963 ) 5964 5965 # Load header as VCF object 5966 parquet_hdr_vcf_header_infos = database.get_header().infos 5967 # Log 5968 log.debug( 5969 "Annotation database header: " 5970 + str(parquet_hdr_vcf_header_infos) 5971 ) 5972 5973 # Get extra infos 5974 parquet_columns = database.get_extra_columns() 5975 # Log 5976 log.debug("Annotation database Columns: " + str(parquet_columns)) 5977 5978 # Add extra columns if "ALL" in annotation_fields 5979 # if "ALL" in annotation_fields: 5980 # allow_add_extra_column = True 5981 if "ALL" in annotation_fields and database.get_extra_columns(): 5982 for extra_column in database.get_extra_columns(): 5983 if ( 5984 extra_column not in annotation_fields 5985 and extra_column.replace("INFO/", "") 5986 not in parquet_hdr_vcf_header_infos 5987 ): 5988 parquet_hdr_vcf_header_infos[extra_column] = ( 5989 vcf.parser._Info( 5990 extra_column, 5991 ".", 5992 "String", 5993 f"{extra_column} description", 5994 "unknown", 5995 "unknown", 5996 self.code_type_map["String"], 5997 ) 5998 ) 5999 6000 # For all fields in database 6001 annotation_fields_all = False 6002 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6003 annotation_fields_all = True 6004 annotation_fields = { 6005 key: key for key in parquet_hdr_vcf_header_infos 6006 } 6007 6008 log.debug( 6009 
"Annotation database header - All annotations added: " 6010 + str(annotation_fields) 6011 ) 6012 6013 # Init 6014 6015 # List of annotation fields to use 6016 sql_query_annotation_update_info_sets = [] 6017 6018 # List of annotation to agregate 6019 sql_query_annotation_to_agregate = [] 6020 6021 # Number of fields 6022 nb_annotation_field = 0 6023 6024 # Annotation fields processed 6025 annotation_fields_processed = [] 6026 6027 # Columns mapping 6028 map_columns = database.map_columns( 6029 columns=annotation_fields, prefixes=["INFO/"] 6030 ) 6031 6032 # Query dict for fields to remove (update option) 6033 query_dict_remove = {} 6034 6035 # Fetch Anotation fields 6036 for annotation_field in annotation_fields: 6037 6038 # annotation_field_column 6039 annotation_field_column = map_columns.get( 6040 annotation_field, "INFO" 6041 ) 6042 6043 # field new name, if parametered 6044 annotation_fields_new_name = annotation_fields.get( 6045 annotation_field, annotation_field 6046 ) 6047 if not annotation_fields_new_name: 6048 annotation_fields_new_name = annotation_field 6049 6050 # To annotate 6051 # force_update_annotation = True 6052 # force_append_annotation = True 6053 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)): 6054 if annotation_field in parquet_hdr_vcf_header_infos and ( 6055 force_update_annotation 6056 or force_append_annotation 6057 or ( 6058 annotation_fields_new_name 6059 not in self.get_header().infos 6060 ) 6061 ): 6062 6063 # Add field to annotation to process list 6064 annotation_fields_processed.append( 6065 annotation_fields_new_name 6066 ) 6067 6068 # explode infos for the field 6069 annotation_fields_new_name_info_msg = "" 6070 if ( 6071 force_update_annotation 6072 and annotation_fields_new_name 6073 in self.get_header().infos 6074 ): 6075 # Remove field from INFO 6076 query = f""" 6077 UPDATE {table_variants} as table_variants 6078 SET INFO = 
REGEXP_REPLACE( 6079 concat(table_variants.INFO,''), 6080 ';*{annotation_fields_new_name}=[^;]*', 6081 '' 6082 ) 6083 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6084 """ 6085 annotation_fields_new_name_info_msg = " [update]" 6086 query_dict_remove[ 6087 f"remove 'INFO/{annotation_fields_new_name}'" 6088 ] = query 6089 6090 # Sep between fields in INFO 6091 nb_annotation_field += 1 6092 if nb_annotation_field > 1: 6093 annotation_field_sep = ";" 6094 else: 6095 annotation_field_sep = "" 6096 6097 log.info( 6098 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6099 ) 6100 6101 # Add INFO field to header 6102 parquet_hdr_vcf_header_infos_number = ( 6103 parquet_hdr_vcf_header_infos[annotation_field].num 6104 or "." 6105 ) 6106 parquet_hdr_vcf_header_infos_type = ( 6107 parquet_hdr_vcf_header_infos[annotation_field].type 6108 or "String" 6109 ) 6110 parquet_hdr_vcf_header_infos_description = ( 6111 parquet_hdr_vcf_header_infos[annotation_field].desc 6112 or f"{annotation_field} description" 6113 ) 6114 parquet_hdr_vcf_header_infos_source = ( 6115 parquet_hdr_vcf_header_infos[annotation_field].source 6116 or "unknown" 6117 ) 6118 parquet_hdr_vcf_header_infos_version = ( 6119 parquet_hdr_vcf_header_infos[annotation_field].version 6120 or "unknown" 6121 ) 6122 6123 vcf_reader.infos[annotation_fields_new_name] = ( 6124 vcf.parser._Info( 6125 annotation_fields_new_name, 6126 parquet_hdr_vcf_header_infos_number, 6127 parquet_hdr_vcf_header_infos_type, 6128 parquet_hdr_vcf_header_infos_description, 6129 parquet_hdr_vcf_header_infos_source, 6130 parquet_hdr_vcf_header_infos_version, 6131 self.code_type_map[ 6132 parquet_hdr_vcf_header_infos_type 6133 ], 6134 ) 6135 ) 6136 6137 # Append 6138 if force_append_annotation: 6139 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 
6140 else: 6141 query_case_when_append = "" 6142 6143 # Annotation/Update query fields 6144 # Found in INFO column 6145 if ( 6146 annotation_field_column == "INFO" 6147 and "INFO" in parquet_hdr_vcf_header_columns 6148 ): 6149 sql_query_annotation_update_info_sets.append( 6150 f""" 6151 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6152 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6153 ELSE '' 6154 END 6155 """ 6156 ) 6157 # Found in a specific column 6158 else: 6159 sql_query_annotation_update_info_sets.append( 6160 f""" 6161 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 6162 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6163 ELSE '' 6164 END 6165 """ 6166 ) 6167 sql_query_annotation_to_agregate.append( 6168 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6169 ) 6170 6171 # Not to annotate 6172 else: 6173 6174 if force_update_annotation: 6175 annotation_message = "forced" 6176 else: 6177 annotation_message = "skipped" 6178 6179 if annotation_field not in parquet_hdr_vcf_header_infos: 6180 log.warning( 6181 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6182 ) 6183 if annotation_fields_new_name in self.get_header().infos: 6184 log.warning( 6185 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6186 ) 6187 6188 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6189 # allow_annotation_full_info = True 6190 allow_annotation_full_info = not force_append_annotation 6191 6192 if parquet_type in ["regions"]: 6193 allow_annotation_full_info = False 6194 6195 if ( 6196 allow_annotation_full_info 6197 and nb_annotation_field == len(annotation_fields) 6198 and annotation_fields_all 6199 and ( 6200 "INFO" in parquet_hdr_vcf_header_columns 6201 and "INFO" in database.get_extra_columns() 6202 ) 6203 ): 6204 log.debug("Column INFO annotation enabled") 6205 sql_query_annotation_update_info_sets = [] 6206 sql_query_annotation_update_info_sets.append( 6207 f" table_parquet.INFO " 6208 ) 6209 6210 if sql_query_annotation_update_info_sets: 6211 6212 # Annotate 6213 log.info(f"Annotation '{annotation_name}' - Annotation...") 6214 6215 # Join query annotation update info sets for SQL 6216 sql_query_annotation_update_info_sets_sql = ",".join( 6217 sql_query_annotation_update_info_sets 6218 ) 6219 6220 # Check chromosomes list (and variants infos) 6221 sql_query_chromosomes = f""" 6222 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6223 FROM {table_variants} as table_variants 6224 GROUP BY table_variants."#CHROM" 6225 ORDER BY table_variants."#CHROM" 6226 """ 6227 sql_query_chromosomes_df = self.conn.execute( 6228 sql_query_chromosomes 6229 ).df() 6230 sql_query_chromosomes_dict = { 6231 entry["CHROM"]: { 6232 "count": entry["count_variants"], 6233 "min": entry["min_variants"], 6234 "max": entry["max_variants"], 6235 } 6236 for index, entry in sql_query_chromosomes_df.iterrows() 6237 } 6238 6239 # Init 6240 nb_of_query = 0 6241 nb_of_variant_annotated = 0 6242 query_dict = query_dict_remove 6243 6244 # for chrom in sql_query_chromosomes_df["CHROM"]: 6245 for chrom in sql_query_chromosomes_dict: 6246 6247 # Number of variant by chromosome 6248 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6249 chrom, {} 6250 ).get("count", 0) 6251 6252 
log.debug( 6253 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6254 ) 6255 6256 # Annotation with regions database 6257 if parquet_type in ["regions"]: 6258 sql_query_annotation_from_clause = f""" 6259 FROM ( 6260 SELECT 6261 '{chrom}' AS \"#CHROM\", 6262 table_variants_from.\"POS\" AS \"POS\", 6263 {",".join(sql_query_annotation_to_agregate)} 6264 FROM {table_variants} as table_variants_from 6265 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6266 table_parquet_from."#CHROM" = '{chrom}' 6267 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6268 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 6269 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6270 ) 6271 ) 6272 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6273 GROUP BY table_variants_from.\"POS\" 6274 ) 6275 as table_parquet 6276 """ 6277 6278 sql_query_annotation_where_clause = """ 6279 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6280 AND table_parquet.\"POS\" = table_variants.\"POS\" 6281 """ 6282 6283 # Annotation with variants database 6284 else: 6285 sql_query_annotation_from_clause = f""" 6286 FROM {parquet_file_link} as table_parquet 6287 """ 6288 sql_query_annotation_where_clause = f""" 6289 table_variants."#CHROM" = '{chrom}' 6290 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6291 AND table_parquet.\"POS\" = table_variants.\"POS\" 6292 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6293 AND table_parquet.\"REF\" = table_variants.\"REF\" 6294 """ 6295 6296 # Create update query 6297 sql_query_annotation_chrom_interval_pos = f""" 6298 UPDATE {table_variants} as table_variants 6299 SET INFO = 6300 concat( 6301 CASE WHEN table_variants.INFO NOT IN ('','.') 6302 THEN table_variants.INFO 6303 ELSE '' 6304 END 6305 , 6306 CASE WHEN table_variants.INFO NOT IN ('','.') 6307 AND ( 6308 
concat({sql_query_annotation_update_info_sets_sql}) 6309 ) 6310 NOT IN ('','.') 6311 THEN ';' 6312 ELSE '' 6313 END 6314 , 6315 {sql_query_annotation_update_info_sets_sql} 6316 ) 6317 {sql_query_annotation_from_clause} 6318 WHERE {sql_query_annotation_where_clause} 6319 ; 6320 """ 6321 6322 # Add update query to dict 6323 query_dict[ 6324 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6325 ] = sql_query_annotation_chrom_interval_pos 6326 6327 nb_of_query = len(query_dict) 6328 num_query = 0 6329 6330 # SET max_expression_depth TO x 6331 self.conn.execute("SET max_expression_depth TO 10000") 6332 6333 for query_name in query_dict: 6334 query = query_dict[query_name] 6335 num_query += 1 6336 log.info( 6337 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6338 ) 6339 result = self.conn.execute(query) 6340 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6341 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6342 log.info( 6343 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6344 ) 6345 6346 log.info( 6347 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6348 ) 6349 6350 else: 6351 6352 log.info( 6353 f"Annotation '{annotation_name}' - No Annotations available" 6354 ) 6355 6356 log.debug("Final header: " + str(vcf_reader.infos)) 6357 6358 # Remove added columns 6359 for added_column in added_columns: 6360 self.drop_column(column=added_column) 6361 6362 def annotation_splice(self, threads: int = None) -> None: 6363 """ 6364 This function annotate with snpEff 6365 6366 :param threads: The number of threads to use 6367 :return: the value of the variable "return_value". 
6368 """ 6369 6370 # DEBUG 6371 log.debug("Start annotation with splice tools") 6372 6373 # Threads 6374 if not threads: 6375 threads = self.get_threads() 6376 log.debug("Threads: " + str(threads)) 6377 6378 # DEBUG 6379 delete_tmp = True 6380 if self.get_config().get("verbosity", "warning") in ["debug"]: 6381 delete_tmp = False 6382 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6383 6384 # Config 6385 config = self.get_config() 6386 log.debug("Config: " + str(config)) 6387 splice_config = config.get("tools", {}).get("splice", {}) 6388 if not splice_config: 6389 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6390 msg_err = "No Splice tool config" 6391 raise ValueError(msg_err) 6392 log.debug(f"splice_config: {splice_config}") 6393 6394 # Config - Folders - Databases 6395 databases_folders = ( 6396 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6397 ) 6398 log.debug("Databases annotations: " + str(databases_folders)) 6399 6400 # Splice docker image 6401 splice_docker_image = splice_config.get("docker").get("image") 6402 6403 # Pull splice image if it's not already there 6404 if not check_docker_image_exists(splice_docker_image): 6405 log.warning( 6406 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6407 ) 6408 try: 6409 command(f"docker pull {splice_config.get('docker').get('image')}") 6410 except subprocess.CalledProcessError: 6411 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6412 log.error(msg_err) 6413 raise ValueError(msg_err) 6414 6415 # Config - splice databases 6416 splice_databases = ( 6417 config.get("folders", {}) 6418 .get("databases", {}) 6419 .get("splice", DEFAULT_SPLICE_FOLDER) 6420 ) 6421 splice_databases = full_path(splice_databases) 6422 6423 # Param 6424 param = self.get_param() 6425 log.debug("Param: " + str(param)) 6426 6427 # Param 6428 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6429 
log.debug("Options: " + str(options)) 6430 6431 # Data 6432 table_variants = self.get_table_variants() 6433 6434 # Check if not empty 6435 log.debug("Check if not empty") 6436 sql_query_chromosomes = ( 6437 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6438 ) 6439 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6440 log.info("VCF empty") 6441 return None 6442 6443 # Export in VCF 6444 log.debug("Create initial file to annotate") 6445 6446 # Create output folder / work folder 6447 if options.get("output_folder", ""): 6448 output_folder = options.get("output_folder", "") 6449 if not os.path.exists(output_folder): 6450 Path(output_folder).mkdir(parents=True, exist_ok=True) 6451 else: 6452 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6453 if not os.path.exists(output_folder): 6454 Path(output_folder).mkdir(parents=True, exist_ok=True) 6455 6456 if options.get("workdir", ""): 6457 workdir = options.get("workdir", "") 6458 else: 6459 workdir = "/work" 6460 6461 # Create tmp VCF file 6462 tmp_vcf = NamedTemporaryFile( 6463 prefix=self.get_prefix(), 6464 dir=output_folder, 6465 suffix=".vcf", 6466 delete=False, 6467 ) 6468 tmp_vcf_name = tmp_vcf.name 6469 6470 # VCF header 6471 header = self.get_header() 6472 6473 # Existing annotations 6474 for vcf_annotation in self.get_header().infos: 6475 6476 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6477 log.debug( 6478 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6479 ) 6480 6481 # Memory limit 6482 if config.get("memory", None): 6483 memory_limit = config.get("memory", "8G").upper() 6484 # upper() 6485 else: 6486 memory_limit = "8G" 6487 log.debug(f"memory_limit: {memory_limit}") 6488 6489 # Check number of variants to annotate 6490 where_clause_regex_spliceai = r"SpliceAI_\w+" 6491 where_clause_regex_spip = r"SPiP_\w+" 6492 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6493 df_list_of_variants_to_annotate = self.get_query_to_df( 6494 query=f""" SELECT * FROM variants {where_clause} """ 6495 ) 6496 if len(df_list_of_variants_to_annotate) == 0: 6497 log.warning( 6498 f"No variants to annotate with splice. Variants probably already annotated with splice" 6499 ) 6500 return None 6501 else: 6502 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6503 6504 # Export VCF file 6505 self.export_variant_vcf( 6506 vcf_file=tmp_vcf_name, 6507 remove_info=True, 6508 add_samples=True, 6509 index=False, 6510 where_clause=where_clause, 6511 ) 6512 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6513 if any(value for value in splice_config.values() if value is None): 6514 log.warning("At least one splice config parameter is empty") 6515 # exit annotation_splice 6516 return None 6517 6518 # Params in splice nf 6519 def check_values(dico: dict): 6520 """ 6521 Ensure parameters for NF splice pipeline 6522 """ 6523 for key, val in dico.items(): 6524 if key == "genome": 6525 if any( 6526 assemb in options.get("genome", {}) 6527 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6528 ): 6529 yield f"--{key} hg19" 6530 elif any( 6531 assemb in options.get("genome", {}) 6532 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6533 ): 6534 yield f"--{key} hg38" 6535 elif ( 6536 (isinstance(val, str) and val) 6537 or isinstance(val, int) 6538 or isinstance(val, bool) 6539 ): 6540 yield f"--{key} {val}" 6541 6542 # Genome 6543 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6544 options["genome"] = genome 6545 # NF params 6546 nf_params = [] 6547 # Add options 6548 if options: 6549 log.debug(options) 6550 nf_params = list(check_values(options)) 6551 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6552 else: 6553 log.debug("No NF params provided") 6554 # Add threads 6555 if "threads" not in 
options.keys(): 6556 nf_params.append(f"--threads {threads}") 6557 # Genome path 6558 genome_path = find_genome( 6559 config.get("folders", {}) 6560 .get("databases", {}) 6561 .get("genomes", DEFAULT_GENOME_FOLDER), 6562 file=f"{genome}.fa", 6563 ) 6564 # Add genome path 6565 if not genome_path: 6566 raise ValueError( 6567 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6568 ) 6569 else: 6570 log.debug(f"Genome: {genome_path}") 6571 nf_params.append(f"--genome_path {genome_path}") 6572 6573 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6574 """ 6575 Setting up updated databases for SPiP and SpliceAI 6576 """ 6577 6578 try: 6579 6580 # SpliceAI assembly transcriptome 6581 spliceai_assembly = os.path.join( 6582 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6583 options.get("genome"), 6584 "transcriptome", 6585 ) 6586 spip_assembly = options.get("genome") 6587 6588 spip = find( 6589 f"transcriptome_{spip_assembly}.RData", 6590 config.get("folders", {}).get("databases", {}).get("spip", {}), 6591 ) 6592 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6593 log.debug(f"SPiP annotations: {spip}") 6594 log.debug(f"SpliceAI annotations: {spliceai}") 6595 if spip and spliceai: 6596 return [ 6597 f"--spip_transcriptome {spip}", 6598 f"--spliceai_transcriptome {spliceai}", 6599 ] 6600 else: 6601 log.warning( 6602 "Can't find splice databases in configuration, use annotations file from image" 6603 ) 6604 except TypeError: 6605 log.warning( 6606 "Can't find splice databases in configuration, use annotations file from image" 6607 ) 6608 return [] 6609 6610 # Add options, check if transcriptome option have already beend provided 6611 if ( 6612 "spip_transcriptome" not in nf_params 6613 and "spliceai_transcriptome" not in nf_params 6614 ): 6615 splice_reference = splice_annotations(options, config) 6616 if splice_reference: 6617 
nf_params.extend(splice_reference) 6618 # nf_params.append(f"--output_folder {output_folder}") 6619 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6620 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6621 log.debug(cmd) 6622 splice_config["docker"]["command"] = cmd 6623 6624 # Ensure proxy is set 6625 proxy = [ 6626 f"-e {var}={os.getenv(var)}" 6627 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6628 if os.getenv(var) is not None 6629 ] 6630 docker_cmd = get_bin_command( 6631 tool="splice", 6632 bin_type="docker", 6633 config=config, 6634 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6635 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6636 ) 6637 # print(docker_cmd) 6638 # exit() 6639 # Docker debug 6640 # if splice_config.get("rm_container"): 6641 # rm_container = "--rm" 6642 # else: 6643 # rm_container = "" 6644 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6645 log.debug(docker_cmd) 6646 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6647 log.debug(res.stdout) 6648 if res.stderr: 6649 log.error(res.stderr) 6650 res.check_returncode() 6651 # Update variants 6652 log.info("Annotation - Updating...") 6653 # Test find output vcf 6654 log.debug( 6655 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6656 ) 6657 output_vcf = [] 6658 # Wrong folder to look in 6659 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6660 if ( 6661 files 6662 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6663 ): 6664 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6665 # log.debug(os.listdir(options.get("output_folder"))) 6666 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6667 if not output_vcf: 6668 log.debug( 6669 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6670 ) 6671 else: 6672 # Get new header from annotated vcf 6673 log.debug(f"Initial header: {len(header.infos)} fields") 6674 # Create new header with splice infos 6675 new_vcf = Variants(input=output_vcf[0]) 6676 new_vcf_header = new_vcf.get_header().infos 6677 for keys, infos in new_vcf_header.items(): 6678 if keys not in header.infos.keys(): 6679 header.infos[keys] = infos 6680 log.debug(f"New header: {len(header.infos)} fields") 6681 log.debug(f"Splice tmp output: {output_vcf[0]}") 6682 self.update_from_vcf(output_vcf[0]) 6683 6684 # Remove file 6685 remove_if_exists(output_vcf) 6686 6687 ### 6688 # Prioritization 6689 ### 6690 6691 def get_config_default(self, name: str) -> dict: 6692 """ 6693 The function `get_config_default` returns a dictionary containing default configurations for 6694 various calculations and prioritizations. 6695 6696 :param name: The `get_config_default` function returns a dictionary containing default 6697 configurations for different calculations and prioritizations. The `name` parameter is used to 6698 specify which specific configuration to retrieve from the dictionary 6699 :type name: str 6700 :return: The function `get_config_default` returns a dictionary containing default configuration 6701 settings for different calculations and prioritizations. The specific configuration settings are 6702 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6703 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6704 returned. If there is no match, an empty dictionary is returned. 
6705 """ 6706 6707 config_default = { 6708 "calculations": { 6709 "variant_chr_pos_alt_ref": { 6710 "type": "sql", 6711 "name": "variant_chr_pos_alt_ref", 6712 "description": "Create a variant ID with chromosome, position, alt and ref", 6713 "available": False, 6714 "output_column_name": "variant_chr_pos_alt_ref", 6715 "output_column_type": "String", 6716 "output_column_description": "variant ID with chromosome, position, alt and ref", 6717 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6718 "operation_info": True, 6719 }, 6720 "VARTYPE": { 6721 "type": "sql", 6722 "name": "VARTYPE", 6723 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6724 "available": True, 6725 "output_column_name": "VARTYPE", 6726 "output_column_type": "String", 6727 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6728 "operation_query": """ 6729 CASE 6730 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6731 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6732 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6733 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6734 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6735 ELSE 'UNDEFINED' 6736 END 6737 """, 6738 "info_fields": ["SVTYPE"], 6739 "operation_info": True, 6740 }, 6741 "snpeff_hgvs": { 6742 "type": "python", 6743 "name": "snpeff_hgvs", 6744 "description": "HGVS nomenclatures from snpEff annotation", 6745 "available": True, 6746 "function_name": "calculation_extract_snpeff_hgvs", 6747 "function_params": ["snpeff_hgvs", "ANN"], 6748 }, 6749 "snpeff_ann_explode": { 6750 "type": "python", 6751 "name": "snpeff_ann_explode", 6752 "description": "Explode snpEff annotations with uniquify values", 6753 "available": True, 6754 "function_name": "calculation_snpeff_ann_explode", 6755 "function_params": [False, "fields", "snpeff_", "ANN"], 6756 }, 6757 "snpeff_ann_explode_uniquify": { 6758 "type": "python", 6759 "name": 
"snpeff_ann_explode_uniquify", 6760 "description": "Explode snpEff annotations", 6761 "available": True, 6762 "function_name": "calculation_snpeff_ann_explode", 6763 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6764 }, 6765 "snpeff_ann_explode_json": { 6766 "type": "python", 6767 "name": "snpeff_ann_explode_json", 6768 "description": "Explode snpEff annotations in JSON format", 6769 "available": True, 6770 "function_name": "calculation_snpeff_ann_explode", 6771 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6772 }, 6773 "NOMEN": { 6774 "type": "python", 6775 "name": "NOMEN", 6776 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6777 "available": True, 6778 "function_name": "calculation_extract_nomen", 6779 "function_params": [], 6780 }, 6781 "FINDBYPIPELINE": { 6782 "type": "python", 6783 "name": "FINDBYPIPELINE", 6784 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6785 "available": True, 6786 "function_name": "calculation_find_by_pipeline", 6787 "function_params": ["findbypipeline"], 6788 }, 6789 "FINDBYSAMPLE": { 6790 "type": "python", 6791 "name": "FINDBYSAMPLE", 6792 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6793 "available": True, 6794 "function_name": "calculation_find_by_pipeline", 6795 "function_params": ["findbysample"], 6796 }, 6797 "GENOTYPECONCORDANCE": { 6798 "type": "python", 6799 "name": "GENOTYPECONCORDANCE", 6800 "description": "Concordance of genotype for multi caller VCF", 6801 "available": True, 6802 "function_name": "calculation_genotype_concordance", 6803 "function_params": [], 6804 }, 6805 "BARCODE": { 6806 "type": "python", 6807 "name": "BARCODE", 6808 "description": "BARCODE as VaRank tool", 6809 "available": True, 6810 "function_name": "calculation_barcode", 6811 "function_params": [], 6812 }, 6813 "BARCODEFAMILY": { 6814 "type": "python", 6815 
"name": "BARCODEFAMILY", 6816 "description": "BARCODEFAMILY as VaRank tool", 6817 "available": True, 6818 "function_name": "calculation_barcode_family", 6819 "function_params": ["BCF"], 6820 }, 6821 "TRIO": { 6822 "type": "python", 6823 "name": "TRIO", 6824 "description": "Inheritance for a trio family", 6825 "available": True, 6826 "function_name": "calculation_trio", 6827 "function_params": [], 6828 }, 6829 "VAF": { 6830 "type": "python", 6831 "name": "VAF", 6832 "description": "Variant Allele Frequency (VAF) harmonization", 6833 "available": True, 6834 "function_name": "calculation_vaf_normalization", 6835 "function_params": [], 6836 }, 6837 "VAF_stats": { 6838 "type": "python", 6839 "name": "VAF_stats", 6840 "description": "Variant Allele Frequency (VAF) statistics", 6841 "available": True, 6842 "function_name": "calculation_genotype_stats", 6843 "function_params": ["VAF"], 6844 }, 6845 "DP_stats": { 6846 "type": "python", 6847 "name": "DP_stats", 6848 "description": "Depth (DP) statistics", 6849 "available": True, 6850 "function_name": "calculation_genotype_stats", 6851 "function_params": ["DP"], 6852 }, 6853 "variant_id": { 6854 "type": "python", 6855 "name": "variant_id", 6856 "description": "Variant ID generated from variant position and type", 6857 "available": True, 6858 "function_name": "calculation_variant_id", 6859 "function_params": [], 6860 }, 6861 "transcripts_json": { 6862 "type": "python", 6863 "name": "transcripts_json", 6864 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6865 "available": True, 6866 "function_name": "calculation_transcripts_annotation", 6867 "function_params": ["transcripts_json", None], 6868 }, 6869 "transcripts_ann": { 6870 "type": "python", 6871 "name": "transcripts_ann", 6872 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6873 "available": True, 6874 "function_name": "calculation_transcripts_annotation", 6875 "function_params": [None, 
"transcripts_ann"], 6876 }, 6877 "transcripts_annotations": { 6878 "type": "python", 6879 "name": "transcripts_annotations", 6880 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6881 "available": True, 6882 "function_name": "calculation_transcripts_annotation", 6883 "function_params": [None, None], 6884 }, 6885 "transcripts_prioritization": { 6886 "type": "python", 6887 "name": "transcripts_prioritization", 6888 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6889 "available": True, 6890 "function_name": "calculation_transcripts_prioritization", 6891 "function_params": [], 6892 }, 6893 "transcripts_export": { 6894 "type": "python", 6895 "name": "transcripts_export", 6896 "description": "Export transcripts table/view as a file (using param.json)", 6897 "available": True, 6898 "function_name": "calculation_transcripts_export", 6899 "function_params": [], 6900 }, 6901 }, 6902 "prioritizations": { 6903 "default": { 6904 "ANN2": [ 6905 { 6906 "type": "contains", 6907 "value": "HIGH", 6908 "score": 5, 6909 "flag": "PASS", 6910 "comment": [ 6911 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6912 ], 6913 }, 6914 { 6915 "type": "contains", 6916 "value": "MODERATE", 6917 "score": 3, 6918 "flag": "PASS", 6919 "comment": [ 6920 "A non-disruptive variant that might change protein effectiveness" 6921 ], 6922 }, 6923 { 6924 "type": "contains", 6925 "value": "LOW", 6926 "score": 0, 6927 "flag": "FILTERED", 6928 "comment": [ 6929 "Assumed to be mostly harmless or unlikely to change protein behavior" 6930 ], 6931 }, 6932 { 6933 "type": "contains", 6934 "value": "MODIFIER", 6935 "score": 0, 6936 "flag": "FILTERED", 6937 "comment": [ 6938 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 
6939 ], 6940 }, 6941 ], 6942 } 6943 }, 6944 } 6945 6946 return config_default.get(name, None) 6947 6948 def get_config_json( 6949 self, name: str, config_dict: dict = {}, config_file: str = None 6950 ) -> dict: 6951 """ 6952 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6953 default values, a dictionary, and a file. 6954 6955 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6956 the name of the configuration. It is used to identify and retrieve the configuration settings 6957 for a specific component or module 6958 :type name: str 6959 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6960 dictionary that allows you to provide additional configuration settings or overrides. When you 6961 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6962 the key is the configuration setting you want to override or 6963 :type config_dict: dict 6964 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6965 specify the path to a configuration file that contains additional settings. If provided, the 6966 function will read the contents of this file and update the configuration dictionary with the 6967 values found in the file, overriding any existing values with the 6968 :type config_file: str 6969 :return: The function `get_config_json` returns a dictionary containing the configuration 6970 settings. 
6971 """ 6972 6973 # Create with default prioritizations 6974 config_default = self.get_config_default(name=name) 6975 configuration = config_default 6976 # log.debug(f"configuration={configuration}") 6977 6978 # Replace prioritizations from dict 6979 for config in config_dict: 6980 configuration[config] = config_dict[config] 6981 6982 # Replace prioritizations from file 6983 config_file = full_path(config_file) 6984 if config_file: 6985 if os.path.exists(config_file): 6986 with open(config_file) as config_file_content: 6987 config_file_dict = json.load(config_file_content) 6988 for config in config_file_dict: 6989 configuration[config] = config_file_dict[config] 6990 else: 6991 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6992 log.error(msg_error) 6993 raise ValueError(msg_error) 6994 6995 return configuration 6996 6997 def prioritization( 6998 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 6999 ) -> bool: 7000 """ 7001 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7002 prioritizes variants based on configured profiles and criteria. 7003 7004 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7005 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7006 a table name is provided, the method will prioritize the variants in that specific table 7007 :type table: str 7008 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7009 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7010 provided, the code will use a default prefix value of "PZ" 7011 :type pz_prefix: str 7012 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7013 additional parameters specific to the prioritization process. 
These parameters can include 7014 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7015 configurations needed for the prioritization of variants in a V 7016 :type pz_param: dict 7017 :return: A boolean value (True) is being returned from the `prioritization` function. 7018 """ 7019 7020 # Config 7021 config = self.get_config() 7022 7023 # Param 7024 param = self.get_param() 7025 7026 # Prioritization param 7027 if pz_param is not None: 7028 prioritization_param = pz_param 7029 else: 7030 prioritization_param = param.get("prioritization", {}) 7031 7032 # Configuration profiles 7033 prioritization_config_file = prioritization_param.get( 7034 "prioritization_config", None 7035 ) 7036 prioritization_config_file = full_path(prioritization_config_file) 7037 prioritizations_config = self.get_config_json( 7038 name="prioritizations", config_file=prioritization_config_file 7039 ) 7040 7041 # Prioritization prefix 7042 pz_prefix_default = "PZ" 7043 if pz_prefix is None: 7044 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7045 7046 # Prioritization options 7047 profiles = prioritization_param.get("profiles", []) 7048 if isinstance(profiles, str): 7049 profiles = profiles.split(",") 7050 pzfields = prioritization_param.get( 7051 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7052 ) 7053 if isinstance(pzfields, str): 7054 pzfields = pzfields.split(",") 7055 default_profile = prioritization_param.get("default_profile", None) 7056 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7057 prioritization_score_mode = prioritization_param.get( 7058 "prioritization_score_mode", "HOWARD" 7059 ) 7060 7061 # Quick Prioritizations 7062 prioritizations = param.get("prioritizations", None) 7063 if prioritizations: 7064 log.info("Quick Prioritization:") 7065 for profile in prioritizations.split(","): 7066 if profile not in profiles: 7067 profiles.append(profile) 7068 log.info(f" {profile}") 7069 7070 # If 
profile "ALL" provided, all profiles in the config profiles 7071 if "ALL" in profiles: 7072 profiles = list(prioritizations_config.keys()) 7073 7074 for profile in profiles: 7075 if prioritizations_config.get(profile, None): 7076 log.debug(f"Profile '{profile}' configured") 7077 else: 7078 msg_error = f"Profile '{profile}' NOT configured" 7079 log.error(msg_error) 7080 raise ValueError(msg_error) 7081 7082 if profiles: 7083 log.info(f"Prioritization... ") 7084 else: 7085 log.debug(f"No profile defined") 7086 return False 7087 7088 if not default_profile and len(profiles): 7089 default_profile = profiles[0] 7090 7091 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7092 log.debug("Profiles to check: " + str(list(profiles))) 7093 7094 # Variables 7095 if table is not None: 7096 table_variants = table 7097 else: 7098 table_variants = self.get_table_variants(clause="update") 7099 log.debug(f"Table to prioritize: {table_variants}") 7100 7101 # Added columns 7102 added_columns = [] 7103 7104 # Create list of PZfields 7105 # List of PZFields 7106 list_of_pzfields_original = pzfields + [ 7107 pzfield + pzfields_sep + profile 7108 for pzfield in pzfields 7109 for profile in profiles 7110 ] 7111 list_of_pzfields = [] 7112 log.debug(f"{list_of_pzfields_original}") 7113 7114 # Remove existing PZfields to use if exists 7115 for pzfield in list_of_pzfields_original: 7116 if self.get_header().infos.get(pzfield, None) is None: 7117 list_of_pzfields.append(pzfield) 7118 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7119 else: 7120 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7121 7122 if list_of_pzfields: 7123 7124 # Explode Infos prefix 7125 explode_infos_prefix = self.get_explode_infos_prefix() 7126 7127 # PZfields tags description 7128 PZfields_INFOS = { 7129 f"{pz_prefix}Tags": { 7130 "ID": f"{pz_prefix}Tags", 7131 "Number": ".", 7132 "Type": "String", 7133 "Description": "Variant tags based on annotation 
criteria", 7134 }, 7135 f"{pz_prefix}Score": { 7136 "ID": f"{pz_prefix}Score", 7137 "Number": 1, 7138 "Type": "Integer", 7139 "Description": "Variant score based on annotation criteria", 7140 }, 7141 f"{pz_prefix}Flag": { 7142 "ID": f"{pz_prefix}Flag", 7143 "Number": 1, 7144 "Type": "String", 7145 "Description": "Variant flag based on annotation criteria", 7146 }, 7147 f"{pz_prefix}Comment": { 7148 "ID": f"{pz_prefix}Comment", 7149 "Number": ".", 7150 "Type": "String", 7151 "Description": "Variant comment based on annotation criteria", 7152 }, 7153 f"{pz_prefix}Infos": { 7154 "ID": f"{pz_prefix}Infos", 7155 "Number": ".", 7156 "Type": "String", 7157 "Description": "Variant infos based on annotation criteria", 7158 }, 7159 f"{pz_prefix}Class": { 7160 "ID": f"{pz_prefix}Class", 7161 "Number": ".", 7162 "Type": "String", 7163 "Description": "Variant class based on annotation criteria", 7164 }, 7165 } 7166 7167 # Create INFO fields if not exist 7168 for field in PZfields_INFOS: 7169 field_ID = PZfields_INFOS[field]["ID"] 7170 field_description = PZfields_INFOS[field]["Description"] 7171 if field_ID not in self.get_header().infos and field_ID in pzfields: 7172 field_description = ( 7173 PZfields_INFOS[field]["Description"] 7174 + f", profile {default_profile}" 7175 ) 7176 self.get_header().infos[field_ID] = vcf.parser._Info( 7177 field_ID, 7178 PZfields_INFOS[field]["Number"], 7179 PZfields_INFOS[field]["Type"], 7180 field_description, 7181 "unknown", 7182 "unknown", 7183 code_type_map[PZfields_INFOS[field]["Type"]], 7184 ) 7185 7186 # Create INFO fields if not exist for each profile 7187 for profile in prioritizations_config: 7188 if profile in profiles or profiles == []: 7189 for field in PZfields_INFOS: 7190 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7191 field_description = ( 7192 PZfields_INFOS[field]["Description"] 7193 + f", profile {profile}" 7194 ) 7195 if ( 7196 field_ID not in self.get_header().infos 7197 and field in pzfields 7198 ): 
7199 self.get_header().infos[field_ID] = vcf.parser._Info( 7200 field_ID, 7201 PZfields_INFOS[field]["Number"], 7202 PZfields_INFOS[field]["Type"], 7203 field_description, 7204 "unknown", 7205 "unknown", 7206 code_type_map[PZfields_INFOS[field]["Type"]], 7207 ) 7208 7209 # Header 7210 for pzfield in list_of_pzfields: 7211 if re.match(f"{pz_prefix}Score.*", pzfield): 7212 added_column = self.add_column( 7213 table_name=table_variants, 7214 column_name=pzfield, 7215 column_type="INTEGER", 7216 default_value="0", 7217 ) 7218 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7219 added_column = self.add_column( 7220 table_name=table_variants, 7221 column_name=pzfield, 7222 column_type="BOOLEAN", 7223 default_value="1", 7224 ) 7225 elif re.match(f"{pz_prefix}Class.*", pzfield): 7226 added_column = self.add_column( 7227 table_name=table_variants, 7228 column_name=pzfield, 7229 column_type="VARCHAR[]", 7230 default_value="null", 7231 ) 7232 else: 7233 added_column = self.add_column( 7234 table_name=table_variants, 7235 column_name=pzfield, 7236 column_type="STRING", 7237 default_value="''", 7238 ) 7239 added_columns.append(added_column) 7240 7241 # Profiles 7242 if profiles: 7243 7244 # foreach profile in configuration file 7245 for profile in prioritizations_config: 7246 7247 # If profile is asked in param, or ALL are asked (empty profile []) 7248 if profile in profiles or profiles == []: 7249 log.info(f"Profile '{profile}'") 7250 7251 sql_set_info_option = "" 7252 7253 sql_set_info = [] 7254 7255 # PZ fields set 7256 7257 # PZScore 7258 if ( 7259 f"{pz_prefix}Score{pzfields_sep}{profile}" 7260 in list_of_pzfields 7261 ): 7262 sql_set_info.append( 7263 f""" 7264 concat( 7265 '{pz_prefix}Score{pzfields_sep}{profile}=', 7266 {pz_prefix}Score{pzfields_sep}{profile} 7267 ) 7268 """ 7269 ) 7270 if ( 7271 profile == default_profile 7272 and f"{pz_prefix}Score" in list_of_pzfields 7273 ): 7274 sql_set_info.append( 7275 f""" 7276 concat( 7277 '{pz_prefix}Score=', 7278 
{pz_prefix}Score{pzfields_sep}{profile} 7279 ) 7280 """ 7281 ) 7282 7283 # PZFlag 7284 if ( 7285 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7286 in list_of_pzfields 7287 ): 7288 sql_set_info.append( 7289 f""" 7290 concat( 7291 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7292 CASE 7293 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7294 THEN 'PASS' 7295 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7296 THEN 'FILTERED' 7297 END 7298 ) 7299 """ 7300 ) 7301 if ( 7302 profile == default_profile 7303 and f"{pz_prefix}Flag" in list_of_pzfields 7304 ): 7305 sql_set_info.append( 7306 f""" 7307 concat( 7308 '{pz_prefix}Flag=', 7309 CASE 7310 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7311 THEN 'PASS' 7312 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7313 THEN 'FILTERED' 7314 END 7315 ) 7316 """ 7317 ) 7318 7319 # PZClass 7320 if ( 7321 f"{pz_prefix}Class{pzfields_sep}{profile}" 7322 in list_of_pzfields 7323 ): 7324 sql_set_info.append( 7325 f""" 7326 concat( 7327 '{pz_prefix}Class{pzfields_sep}{profile}=', 7328 CASE 7329 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7330 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7331 ELSE '.' 7332 END 7333 ) 7334 7335 """ 7336 ) 7337 if ( 7338 profile == default_profile 7339 and f"{pz_prefix}Class" in list_of_pzfields 7340 ): 7341 sql_set_info.append( 7342 f""" 7343 concat( 7344 '{pz_prefix}Class=', 7345 CASE 7346 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7347 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7348 ELSE '.' 
7349 END 7350 ) 7351 """ 7352 ) 7353 7354 # PZComment 7355 if ( 7356 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7357 in list_of_pzfields 7358 ): 7359 sql_set_info.append( 7360 f""" 7361 CASE 7362 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7363 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7364 ELSE '' 7365 END 7366 """ 7367 ) 7368 if ( 7369 profile == default_profile 7370 and f"{pz_prefix}Comment" in list_of_pzfields 7371 ): 7372 sql_set_info.append( 7373 f""" 7374 CASE 7375 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7376 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7377 ELSE '' 7378 END 7379 """ 7380 ) 7381 7382 # PZInfos 7383 if ( 7384 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7385 in list_of_pzfields 7386 ): 7387 sql_set_info.append( 7388 f""" 7389 CASE 7390 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7391 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7392 ELSE '' 7393 END 7394 """ 7395 ) 7396 if ( 7397 profile == default_profile 7398 and f"{pz_prefix}Infos" in list_of_pzfields 7399 ): 7400 sql_set_info.append( 7401 f""" 7402 CASE 7403 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7404 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7405 ELSE '' 7406 END 7407 """ 7408 ) 7409 7410 # Merge PZfields 7411 sql_set_info_option = "" 7412 sql_set_sep = "" 7413 for sql_set in sql_set_info: 7414 if sql_set_sep: 7415 sql_set_info_option += f""" 7416 , concat('{sql_set_sep}', {sql_set}) 7417 """ 7418 else: 7419 sql_set_info_option += f""" 7420 , {sql_set} 7421 """ 7422 sql_set_sep = ";" 7423 7424 sql_queries = [] 7425 for annotation in prioritizations_config[profile]: 7426 7427 # skip special sections 7428 if annotation.startswith("_"): 7429 continue 7430 7431 # For each criterions 7432 for criterion in prioritizations_config[profile][ 7433 annotation 
7434 ]: 7435 7436 # Criterion mode 7437 criterion_mode = None 7438 if np.any( 7439 np.isin(list(criterion.keys()), ["type", "value"]) 7440 ): 7441 criterion_mode = "operation" 7442 elif np.any( 7443 np.isin(list(criterion.keys()), ["sql", "fields"]) 7444 ): 7445 criterion_mode = "sql" 7446 log.debug(f"Criterion Mode: {criterion_mode}") 7447 7448 # Criterion parameters 7449 criterion_type = criterion.get("type", None) 7450 criterion_value = criterion.get("value", None) 7451 criterion_sql = criterion.get("sql", None) 7452 criterion_fields = criterion.get("fields", None) 7453 criterion_score = criterion.get("score", 0) 7454 criterion_flag = criterion.get("flag", "PASS") 7455 criterion_class = criterion.get("class", None) 7456 criterion_flag_bool = criterion_flag == "PASS" 7457 criterion_comment = ( 7458 ", ".join(criterion.get("comment", [])) 7459 .replace("'", "''") 7460 .replace(";", ",") 7461 .replace("\t", " ") 7462 ) 7463 criterion_infos = ( 7464 str(criterion) 7465 .replace("'", "''") 7466 .replace(";", ",") 7467 .replace("\t", " ") 7468 ) 7469 7470 # SQL 7471 if criterion_sql is not None and isinstance( 7472 criterion_sql, list 7473 ): 7474 criterion_sql = " ".join(criterion_sql) 7475 7476 # Fields and explode 7477 if criterion_fields is None: 7478 criterion_fields = [annotation] 7479 if not isinstance(criterion_fields, list): 7480 criterion_fields = str(criterion_fields).split(",") 7481 7482 # Class 7483 if criterion_class is not None and not isinstance( 7484 criterion_class, list 7485 ): 7486 criterion_class = str(criterion_class).split(",") 7487 7488 for annotation_field in criterion_fields: 7489 7490 # Explode specific annotation 7491 log.debug( 7492 f"Explode annotation '{annotation_field}'" 7493 ) 7494 added_columns += self.explode_infos( 7495 prefix=explode_infos_prefix, 7496 fields=[annotation_field], 7497 table=table_variants, 7498 ) 7499 extra_infos = self.get_extra_infos( 7500 table=table_variants 7501 ) 7502 7503 # Check if annotation field is 
present 7504 if ( 7505 f"{explode_infos_prefix}{annotation_field}" 7506 not in extra_infos 7507 ): 7508 msq_err = f"Annotation '{annotation_field}' not in data" 7509 log.error(msq_err) 7510 raise ValueError(msq_err) 7511 else: 7512 log.debug( 7513 f"Annotation '{annotation_field}' in data" 7514 ) 7515 7516 sql_set = [] 7517 sql_set_info = [] 7518 7519 # PZ fields set 7520 7521 # PZScore 7522 if ( 7523 f"{pz_prefix}Score{pzfields_sep}{profile}" 7524 in list_of_pzfields 7525 ): 7526 # if prioritization_score_mode == "HOWARD": 7527 # sql_set.append( 7528 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7529 # ) 7530 # VaRank prioritization score mode 7531 if prioritization_score_mode == "VaRank": 7532 sql_set.append( 7533 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7534 ) 7535 # default HOWARD prioritization score mode 7536 else: 7537 sql_set.append( 7538 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7539 ) 7540 7541 # PZFlag 7542 if ( 7543 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7544 in list_of_pzfields 7545 ): 7546 sql_set.append( 7547 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7548 ) 7549 7550 # PZClass 7551 if ( 7552 f"{pz_prefix}Class{pzfields_sep}{profile}" 7553 in list_of_pzfields 7554 and criterion_class is not None 7555 ): 7556 sql_set.append( 7557 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7558 ) 7559 7560 # PZComment 7561 if ( 7562 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7563 in list_of_pzfields 7564 ): 7565 sql_set.append( 7566 f""" 7567 {pz_prefix}Comment{pzfields_sep}{profile} = 7568 concat( 7569 {pz_prefix}Comment{pzfields_sep}{profile}, 7570 CASE 7571 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7572 THEN ', ' 7573 ELSE '' 7574 END, 7575 '{criterion_comment}' 7576 ) 7577 """ 7578 ) 7579 7580 # PZInfos 7581 if ( 7582 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7583 in list_of_pzfields 7584 ): 7585 sql_set.append( 7586 f""" 7587 {pz_prefix}Infos{pzfields_sep}{profile} = 7588 concat( 7589 {pz_prefix}Infos{pzfields_sep}{profile}, 7590 '{criterion_infos}' 7591 ) 7592 """ 7593 ) 7594 sql_set_option = ",".join(sql_set) 7595 7596 # Criterion and comparison 7597 if sql_set_option: 7598 7599 if criterion_mode in ["operation"]: 7600 7601 try: 7602 float(criterion_value) 7603 sql_update = f""" 7604 UPDATE {table_variants} 7605 SET {sql_set_option} 7606 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7607 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7608 """ 7609 except: 7610 contains_option = "" 7611 if criterion_type == "contains": 7612 contains_option = ".*" 7613 sql_update = f""" 7614 UPDATE {table_variants} 7615 SET {sql_set_option} 7616 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7617 """ 7618 sql_queries.append(sql_update) 7619 7620 elif criterion_mode in ["sql"]: 7621 7622 sql_update = f""" 7623 UPDATE {table_variants} 7624 SET {sql_set_option} 7625 WHERE {criterion_sql} 7626 """ 7627 sql_queries.append(sql_update) 7628 7629 else: 7630 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7631 log.error(msg_err) 7632 raise ValueError(msg_err) 7633 7634 else: 7635 log.warning( 7636 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7637 ) 7638 7639 # PZTags 7640 if ( 7641 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7642 in list_of_pzfields 7643 ): 7644 7645 # Create PZFalgs value 7646 pztags_value = "" 7647 pztags_sep_default = "," 7648 pztags_sep = "" 7649 for pzfield in pzfields: 7650 if pzfield not in [f"{pz_prefix}Tags"]: 7651 if ( 7652 
f"{pzfield}{pzfields_sep}{profile}" 7653 in list_of_pzfields 7654 ): 7655 if pzfield in [f"{pz_prefix}Flag"]: 7656 pztags_value += f"""{pztags_sep}{pzfield}#', 7657 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7658 THEN 'PASS' 7659 ELSE 'FILTERED' 7660 END, '""" 7661 elif pzfield in [f"{pz_prefix}Class"]: 7662 pztags_value += f"""{pztags_sep}{pzfield}#', 7663 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7664 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7665 ELSE '.' 7666 END, '""" 7667 else: 7668 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7669 pztags_sep = pztags_sep_default 7670 7671 # Add Query update for PZFlags 7672 sql_update_pztags = f""" 7673 UPDATE {table_variants} 7674 SET INFO = concat( 7675 INFO, 7676 CASE WHEN INFO NOT in ('','.') 7677 THEN ';' 7678 ELSE '' 7679 END, 7680 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7681 ) 7682 """ 7683 sql_queries.append(sql_update_pztags) 7684 7685 # Add Query update for PZFlags for default 7686 if profile == default_profile: 7687 sql_update_pztags_default = f""" 7688 UPDATE {table_variants} 7689 SET INFO = concat( 7690 INFO, 7691 ';', 7692 '{pz_prefix}Tags={pztags_value}' 7693 ) 7694 """ 7695 sql_queries.append(sql_update_pztags_default) 7696 7697 log.info(f"""Profile '{profile}' - Prioritization... """) 7698 7699 if sql_queries: 7700 7701 for sql_query in sql_queries: 7702 log.debug( 7703 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7704 ) 7705 self.conn.execute(sql_query) 7706 7707 log.info(f"""Profile '{profile}' - Update... 
""") 7708 sql_query_update = f""" 7709 UPDATE {table_variants} 7710 SET INFO = 7711 concat( 7712 CASE 7713 WHEN INFO NOT IN ('','.') 7714 THEN concat(INFO, ';') 7715 ELSE '' 7716 END 7717 {sql_set_info_option} 7718 ) 7719 """ 7720 self.conn.execute(sql_query_update) 7721 7722 else: 7723 7724 log.warning(f"No profiles in parameters") 7725 7726 # Remove added columns 7727 for added_column in added_columns: 7728 self.drop_column(column=added_column) 7729 7730 # Explode INFOS fields into table fields 7731 if self.get_explode_infos(): 7732 self.explode_infos( 7733 prefix=self.get_explode_infos_prefix(), 7734 fields=self.get_explode_infos_fields(), 7735 force=True, 7736 ) 7737 7738 return True 7739 7740 ### 7741 # HGVS 7742 ### 7743 7744 def annotation_hgvs(self, threads: int = None) -> None: 7745 """ 7746 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7747 coordinates and alleles. 7748 7749 :param threads: The `threads` parameter is an optional integer that specifies the number of 7750 threads to use for parallel processing. If no value is provided, it will default to the number 7751 of threads obtained from the `get_threads()` method 7752 :type threads: int 7753 """ 7754 7755 # Function for each partition of the Dask Dataframe 7756 def partition_function(partition): 7757 """ 7758 The function `partition_function` applies the `annotation_hgvs_partition` function to 7759 each row of a DataFrame called `partition`. 7760 7761 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7762 to be processed 7763 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7764 the "partition" dataframe along the axis 1. 
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        SNV/InDel variants are pulled from the variants table, annotated in
        parallel (one Dask partition per thread), and the resulting HGVS names
        are written back into a temporary column and then into the INFO field.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a comma-separated list of HGVS names associated with the given genomic
            coordinates and alleles.

            Closes over `polars_conn`, `transcripts`, `genome` and the hgvs
            options (`use_exon`, `use_protein`, ...) defined in the enclosing scope.

            :param row: A dictionary-like object providing the "CHROM", "POS",
            "REF" and "ALT" keys
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            # NOTE(review): `chr` shadows the chr() builtin (local scope only)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts overlapping this position
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon (only resolved when requested)
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession (looked up in refSeqLink when needed)
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" shortcuts ("opt=val,opt2,...") into
        # the param["hgvs"] dict ("TRUE"/"FALSE" strings become booleans)
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled (no "hgvs" section: nothing to do)
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit path first, then assembly lookup in the genomes folder
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (alphabetic REF/ALT alleles)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (temporary, dropped at the end)
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): random suffix avoids name clashes with existing columns,
        # but a 1/1000 collision is possible — confirm acceptable
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe (transcripts overlapping each variant)
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein mapping)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet, joined on coordinates/alleles
            # NOTE(review): the WHERE clause hard-codes the "variants" alias —
            # presumably get_table_variants() always returns "variants"; confirm
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column (append "hgvs=..." for annotated variants)
        sql_query_update = f"""
        UPDATE {table_variants}
        SET INFO =
            concat(
                CASE
                    WHEN INFO NOT IN ('','.')
                    THEN concat(INFO, ';')
                    ELSE ''
                END,
                'hgvs=',
                {hgvs_column_name}
            )
        WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header (declare the new "hgvs" INFO field in the VCF header)
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (working columns are transient)
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        Run a set of calculation operations on the variants table.

        For each requested operation, looks up its definition in the operations
        configuration and dispatches on its "type": "python" operations go to
        `calculation_process_function`, "sql" operations to
        `calculation_process_sql`. Unknown types or unknown operation names
        raise ValueError.

        Operations can come from (in priority order):
        1. param["calculation"]["calculations"] (overrides the `operations` argument),
        2. param["calculations"] as a comma-separated "quick calculations" string,
        3. the `operations` argument itself.

        param json example:
            "calculation": {
              "NOMEN": {
                "options": {
                  "hgvs_field": "hgvs"
                },
                "middle" : null
              }
            }

        :param operations: dict of operation name -> operation options
        :param operations_config_dict: optional operations configuration dict
        :param operations_config_file: optional operations configuration file path
        """

        # Param
        param = self.get_param()

        # Operations config (dict and/or file), keyed by operation name
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys, so lookups below are case-insensitive
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated string in param["calculations"])
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order: quick
            # calculations first, then operations already configured)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    # NOTE(review): the ".upper()" in this message is printed
                    # literally (it is inside the f-string text, not applied)
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                    add_value_into_dict(
                        dict_tree=operations_tmp,
                        sections=[
                            calculation_operation.upper(),
                        ],
                        value=operations.get(calculation_operation.upper(), {}),
                    )
            # Add operations already in param (without overriding quick ones)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation (fallback if still empty)
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations, dispatch by configured type
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
self.get_explode_infos(): 8276 self.explode_infos( 8277 prefix=self.get_explode_infos_prefix(), 8278 fields=self.get_explode_infos_fields(), 8279 force=True, 8280 ) 8281 8282 def calculation_process_sql( 8283 self, operation: dict, operation_name: str = "unknown" 8284 ) -> None: 8285 """ 8286 The `calculation_process_sql` function takes in a mathematical operation as a string and 8287 performs the operation, updating the specified table with the result. 8288 8289 :param operation: The `operation` parameter is a dictionary that contains information about the 8290 mathematical operation to be performed. It includes the following keys: 8291 :type operation: dict 8292 :param operation_name: The `operation_name` parameter is a string that represents the name of 8293 the mathematical operation being performed. It is used for logging and error handling purposes, 8294 defaults to unknown 8295 :type operation_name: str (optional) 8296 """ 8297 8298 # table variants 8299 table_variants = self.get_table_variants(clause="alter") 8300 8301 # Operation infos 8302 operation_name = operation.get("name", "unknown") 8303 log.debug(f"process sql {operation_name}") 8304 output_column_name = operation.get("output_column_name", operation_name) 8305 output_column_type = operation.get("output_column_type", "String") 8306 prefix = operation.get("explode_infos_prefix", "") 8307 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8308 output_column_description = operation.get( 8309 "output_column_description", f"{operation_name} operation" 8310 ) 8311 operation_query = operation.get("operation_query", None) 8312 if isinstance(operation_query, list): 8313 operation_query = " ".join(operation_query) 8314 operation_info_fields = operation.get("info_fields", []) 8315 operation_info_fields_check = operation.get("info_fields_check", False) 8316 operation_info = operation.get("operation_info", True) 8317 8318 if operation_query: 8319 8320 # Info fields check 8321 
operation_info_fields_check_result = True 8322 if operation_info_fields_check: 8323 header_infos = self.get_header().infos 8324 for info_field in operation_info_fields: 8325 operation_info_fields_check_result = ( 8326 operation_info_fields_check_result 8327 and info_field in header_infos 8328 ) 8329 8330 # If info fields available 8331 if operation_info_fields_check_result: 8332 8333 # Added_columns 8334 added_columns = [] 8335 8336 # Create VCF header field 8337 vcf_reader = self.get_header() 8338 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8339 output_column_name, 8340 ".", 8341 output_column_type, 8342 output_column_description, 8343 "howard calculation", 8344 "0", 8345 self.code_type_map.get(output_column_type), 8346 ) 8347 8348 # Explode infos if needed 8349 log.debug(f"calculation_process_sql prefix {prefix}") 8350 added_columns += self.explode_infos( 8351 prefix=prefix, 8352 fields=[output_column_name] + operation_info_fields, 8353 force=True, 8354 ) 8355 8356 # Create column 8357 added_column = self.add_column( 8358 table_name=table_variants, 8359 column_name=prefix + output_column_name, 8360 column_type=output_column_type_sql, 8361 default_value="null", 8362 ) 8363 added_columns.append(added_column) 8364 8365 # Operation calculation 8366 try: 8367 8368 # Query to update calculation column 8369 sql_update = f""" 8370 UPDATE {table_variants} 8371 SET "{prefix}{output_column_name}" = ({operation_query}) 8372 """ 8373 self.conn.execute(sql_update) 8374 8375 # Add to INFO 8376 if operation_info: 8377 sql_update_info = f""" 8378 UPDATE {table_variants} 8379 SET "INFO" = 8380 concat( 8381 CASE 8382 WHEN "INFO" IS NOT NULL 8383 THEN concat("INFO", ';') 8384 ELSE '' 8385 END, 8386 '{output_column_name}=', 8387 "{prefix}{output_column_name}" 8388 ) 8389 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8390 """ 8391 self.conn.execute(sql_update_info) 8392 8393 except: 8394 log.error( 8395 f"Operations 
config: Calculation '{operation_name}' query failed" 8396 ) 8397 raise ValueError( 8398 f"Operations config: Calculation '{operation_name}' query failed" 8399 ) 8400 8401 # Remove added columns 8402 for added_column in added_columns: 8403 log.debug(f"added_column: {added_column}") 8404 self.drop_column(column=added_column) 8405 8406 else: 8407 log.error( 8408 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8409 ) 8410 raise ValueError( 8411 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8412 ) 8413 8414 else: 8415 log.error( 8416 f"Operations config: Calculation '{operation_name}' query NOT defined" 8417 ) 8418 raise ValueError( 8419 f"Operations config: Calculation '{operation_name}' query NOT defined" 8420 ) 8421 8422 def calculation_process_function( 8423 self, operation: dict, operation_name: str = "unknown" 8424 ) -> None: 8425 """ 8426 The `calculation_process_function` takes in an operation dictionary and performs the specified 8427 function with the given parameters. 8428 8429 :param operation: The `operation` parameter is a dictionary that contains information about the 8430 operation to be performed. It has the following keys: 8431 :type operation: dict 8432 :param operation_name: The `operation_name` parameter is a string that represents the name of 8433 the operation being performed. 
It is used for logging purposes, defaults to unknown 8434 :type operation_name: str (optional) 8435 """ 8436 8437 operation_name = operation["name"] 8438 log.debug(f"process sql {operation_name}") 8439 function_name = operation["function_name"] 8440 function_params = operation["function_params"] 8441 getattr(self, function_name)(*function_params) 8442 8443 def calculation_variant_id(self) -> None: 8444 """ 8445 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8446 updates the INFO field of a variants table with the variant ID. 8447 """ 8448 8449 # variant_id annotation field 8450 variant_id_tag = self.get_variant_id_column() 8451 added_columns = [variant_id_tag] 8452 8453 # variant_id hgvs tags" 8454 vcf_infos_tags = { 8455 variant_id_tag: "howard variant ID annotation", 8456 } 8457 8458 # Variants table 8459 table_variants = self.get_table_variants() 8460 8461 # Header 8462 vcf_reader = self.get_header() 8463 8464 # Add variant_id to header 8465 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8466 variant_id_tag, 8467 ".", 8468 "String", 8469 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8470 "howard calculation", 8471 "0", 8472 self.code_type_map.get("String"), 8473 ) 8474 8475 # Update 8476 sql_update = f""" 8477 UPDATE {table_variants} 8478 SET "INFO" = 8479 concat( 8480 CASE 8481 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8482 THEN '' 8483 ELSE concat("INFO", ';') 8484 END, 8485 '{variant_id_tag}=', 8486 "{variant_id_tag}" 8487 ) 8488 """ 8489 self.conn.execute(sql_update) 8490 8491 # Remove added columns 8492 for added_column in added_columns: 8493 self.drop_column(column=added_column) 8494 8495 def calculation_extract_snpeff_hgvs( 8496 self, 8497 snpeff_hgvs: str = "snpeff_hgvs", 8498 snpeff_field: str = "ANN", 8499 ) -> None: 8500 """ 8501 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8502 annotation field in a VCF file and adds them as a new 
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field and append
        them to the INFO column as a new tag.

        The snpEff sub-field layout is parsed from the quoted part of the ANN
        header description; each variant's ANN value is then reduced to its
        HGVS nomenclatures via `extract_snpeff_hgvs` and written back as
        '<snpeff_hgvs>=<value>'. If the snpEff field is absent from the header,
        a warning is emitted and nothing is changed.

        :param snpeff_hgvs: name of the INFO tag receiving the extracted HGVS
            nomenclatures, defaults to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: INFO field holding snpEff annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # Snpeff hgvs tags (header description for the new tag)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by the literal
        # "INFO/" (the configured value is discarded) — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff exploded-column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode the snpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff lists its sub-fields inside single
            # quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: sub-field name stripped to alphanumerics
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used as the join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of (variant id, ANN value)
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column by extracting HGVS from each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining on the variant id.
            # NOTE: the SQL references the Python local variable name
            # `dataframe_snpeff_hgvs` directly (duckdb resolves dataframes by
            # local name) — do not rename this variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{snpeff_hgvs}=',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode snpEff annotations into INFO tags.

        The snpEff sub-field layout is parsed from the quoted part of the ANN
        header description; each variant's ANN value is expanded through
        `explode_snpeff_ann` and the result is appended to the INFO column —
        either as one JSON blob ('<output_prefix>=...') or as one header entry
        per snpEff sub-field, depending on `output_format`. If the snpEff
        field is absent from the header, a warning is emitted and nothing is
        changed.

        :param uniquify: whether duplicate annotation values should be removed
            in the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output format of the exploded annotations,
            "fields" or "JSON", defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix prepended to the generated annotation
            tags (also used as the tag name itself in JSON mode), defaults to
            snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: INFO field holding snpEff annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the ANN header description cannot be parsed
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags (base header description)
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty prefix is replaced by the literal
        # "INFO/" (the configured value is discarded) — confirm intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff exploded-column names
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Explode the snpEff field into a table column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff lists its sub-fields inside single
            # quotes, separated by " | "
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Normalized key: sub-field name stripped to alphanumerics
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (used as the join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe of (variant id, ANN value)
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns by exploding each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header: one JSON tag, or one tag per snpEff sub-field
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # In JSON mode the exploded value does not embed a tag name,
                # so '<output_prefix>=' is prepended in the SQL below
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update INFO by joining on the variant id.
            # NOTE: the SQL references the Python local variable name
            # `dataframe_snpeff_hgvs` directly (duckdb resolves dataframes by
            # local name) — do not rename this variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        Extract NOMEN hgvs nomenclatures from an exploded HGVS field.

        For every variant, `find_nomen` splits the HGVS annotation into the
        NOMEN components (NOMEN, CNOMEN, RNOMEN, NNOMEN, PNOMEN, TVNOMEN,
        TNOMEN, VNOMEN, ENOMEN, GNOMEN), which are registered in the VCF
        header and appended to the INFO column. Options are read from
        param["calculation"]["calculations"]["NOMEN"]["options"]:
        hgvs_field, pattern, transcripts (file of preferred transcripts),
        transcripts_table/transcripts_column (per-variant transcript source)
        and transcripts_order (preference between "column" and "file").

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Column of the working dataframe holding the raw find_nomen() dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output tag -> VCF header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Temporary columns to drop at the end
        added_columns = []

        # Get HGVS field
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get NOMEN pattern
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # transcripts list of preference sources ("file" and/or "column")
        transcripts_sources = {}

        # Get transcripts file (preferred transcripts, first column used)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Get transcripts table (per-variant transcript source)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Get transcripts column
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Transcripts of preference source order
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts from file
        transcripts = transcripts_sources.get("file", [])

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # extra infos: exploded columns actually present
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Create dataframe of (variant key, hgvs, transcript)
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Create main NOMEN column (dict of all NOMEN components)
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column.
                # Late-binding of `nomen_field` in the lambda is safe here:
                # apply() runs it immediately, within this loop iteration.
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Each CASE carries its own leading ';<tag>=' so INFO itself
                # is not suffixed with ';' in the final concat below
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO by joining on the variant key.
            # NOTE: the SQL references the Python local variable name
            # `dataframe_hgvs` directly (duckdb resolves dataframes by local
            # name) — do not rename this variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                AND variants."POS" = dataframe_hgvs."POS"
                AND variants."REF" = dataframe_hgvs."REF"
                AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        Count in how many pipelines/samples each variant is found, and append
        the result to the INFO column as '<tag>=<value>'.

        Requires a FORMAT column and at least one sample in the header;
        otherwise the method silently does nothing. The per-variant value is
        computed by the `findbypipeline` helper over the sample columns.

        :param tag: INFO tag name for the findbypipeline annotation (also used
            for the VCF header entry), defaults to findbypipeline
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # VCF infos tags (header description)
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working dataframe column name
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used as the join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotype columns
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create findbypipeline column from the sample columns
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add findbypipeline tag to header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining on the variant id.
            # NOTE: the SQL references the Python local variable name
            # `dataframe_findbypipeline` directly (duckdb resolves dataframes
            # by local name) — do not rename this variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                            AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                '{findbypipeline_tag}=',
                                dataframe_findbypipeline."{findbypipeline_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_findbypipeline
            gc.collect()
    def calculation_genotype_concordance(self) -> None:
        """
        Compute genotype concordance across callers for multi-caller VCF
        files, and append the result to the INFO column as
        'genotypeconcordance=<value>'.

        Requires a FORMAT column and at least one sample in the header;
        otherwise the method silently does nothing. The per-variant value is
        computed by the `genotypeconcordance` helper over the sample columns.
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # VCF infos tags (header description)
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working dataframe column name
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (used as the join key below)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotype columns
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create genotypeconcordance column from the sample columns
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Add genotypeconcordance to header (the .get() fallback string
            # is never used: the key is always present in vcf_infos_tags)
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update INFO by joining on the variant id.
            # NOTE: the SQL references the Python local variable name
            # `dataframe_genotypeconcordance` directly (duckdb resolves
            # dataframes by local name) — do not rename this variable.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                            AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                '{genotypeconcordance_tag}=',
                                dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe to release memory
            del dataframe_genotypeconcordance
            gc.collect()
If no tag name is provided, 9219 the default tag name is set to "barcode", defaults to barcode 9220 :type tag: str (optional) 9221 """ 9222 9223 # if FORMAT and samples 9224 if ( 9225 "FORMAT" in self.get_header_columns_as_list() 9226 and self.get_header_sample_list() 9227 ): 9228 9229 # barcode annotation field 9230 if not tag: 9231 tag = "barcode" 9232 9233 # VCF infos tags 9234 vcf_infos_tags = { 9235 tag: "barcode calculation (VaRank)", 9236 } 9237 9238 # Prefix 9239 prefix = self.get_explode_infos_prefix() 9240 9241 # Field 9242 barcode_infos = prefix + tag 9243 9244 # Variants table 9245 table_variants = self.get_table_variants() 9246 9247 # Header 9248 vcf_reader = self.get_header() 9249 9250 # Create variant id 9251 variant_id_column = self.get_variant_id_column() 9252 added_columns = [variant_id_column] 9253 9254 # variant_id, FORMAT and samples 9255 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9256 self.get_header_sample_list() 9257 ) 9258 9259 # Create dataframe 9260 dataframe_barcode = self.get_query_to_df( 9261 f""" SELECT {samples_fields} FROM {table_variants} """ 9262 ) 9263 9264 # Create barcode column 9265 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9266 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9267 ) 9268 9269 # Add barcode to header 9270 vcf_reader.infos[tag] = vcf.parser._Info( 9271 tag, 9272 ".", 9273 "String", 9274 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9275 "howard calculation", 9276 "0", 9277 self.code_type_map.get("String"), 9278 ) 9279 9280 # Update 9281 sql_update = f""" 9282 UPDATE {table_variants} 9283 SET "INFO" = 9284 concat( 9285 CASE 9286 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9287 THEN '' 9288 ELSE concat("INFO", ';') 9289 END, 9290 CASE 9291 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9292 AND dataframe_barcode."{barcode_infos}" NOT NULL 9293 THEN concat( 9294 '{tag}=', 9295 dataframe_barcode."{barcode_infos}" 9296 ) 9297 ELSE '' 9298 
END 9299 ) 9300 FROM dataframe_barcode 9301 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9302 """ 9303 self.conn.execute(sql_update) 9304 9305 # Remove added columns 9306 for added_column in added_columns: 9307 self.drop_column(column=added_column) 9308 9309 # Delete dataframe 9310 del dataframe_barcode 9311 gc.collect() 9312 9313 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9314 """ 9315 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9316 and updates the INFO field in the file with the calculated barcode values. 9317 9318 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9319 the barcode tag that will be added to the VCF file during the calculation process. If no value 9320 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9321 :type tag: str (optional) 9322 """ 9323 9324 # if FORMAT and samples 9325 if ( 9326 "FORMAT" in self.get_header_columns_as_list() 9327 and self.get_header_sample_list() 9328 ): 9329 9330 # barcode annotation field 9331 if not tag: 9332 tag = "BCF" 9333 9334 # VCF infos tags 9335 vcf_infos_tags = { 9336 tag: "barcode family calculation", 9337 f"{tag}S": "barcode family samples", 9338 } 9339 9340 # Param 9341 param = self.get_param() 9342 log.debug(f"param={param}") 9343 9344 # Prefix 9345 prefix = self.get_explode_infos_prefix() 9346 9347 # PED param 9348 ped = ( 9349 param.get("calculation", {}) 9350 .get("calculations", {}) 9351 .get("BARCODEFAMILY", {}) 9352 .get("family_pedigree", None) 9353 ) 9354 log.debug(f"ped={ped}") 9355 9356 # Load PED 9357 if ped: 9358 9359 # Pedigree is a file 9360 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9361 log.debug("Pedigree is file") 9362 with open(full_path(ped)) as ped: 9363 ped = json.load(ped) 9364 9365 # Pedigree is a string 9366 elif isinstance(ped, str): 9367 log.debug("Pedigree is str") 9368 
try: 9369 ped = json.loads(ped) 9370 log.debug("Pedigree is json str") 9371 except ValueError as e: 9372 ped_samples = ped.split(",") 9373 ped = {} 9374 for ped_sample in ped_samples: 9375 ped[ped_sample] = ped_sample 9376 9377 # Pedigree is a dict 9378 elif isinstance(ped, dict): 9379 log.debug("Pedigree is dict") 9380 9381 # Pedigree is not well formatted 9382 else: 9383 msg_error = "Pedigree not well formatted" 9384 log.error(msg_error) 9385 raise ValueError(msg_error) 9386 9387 # Construct list 9388 ped_samples = list(ped.values()) 9389 9390 else: 9391 log.debug("Pedigree not defined. Take all samples") 9392 ped_samples = self.get_header_sample_list() 9393 ped = {} 9394 for ped_sample in ped_samples: 9395 ped[ped_sample] = ped_sample 9396 9397 # Check pedigree 9398 if not ped or len(ped) == 0: 9399 msg_error = f"Error in pedigree: samples {ped_samples}" 9400 log.error(msg_error) 9401 raise ValueError(msg_error) 9402 9403 # Log 9404 log.info( 9405 "Calculation 'BARCODEFAMILY' - Samples: " 9406 + ", ".join([f"{member}='{ped[member]}'" for member in ped]) 9407 ) 9408 log.debug(f"ped_samples={ped_samples}") 9409 9410 # Field 9411 barcode_infos = prefix + tag 9412 9413 # Variants table 9414 table_variants = self.get_table_variants() 9415 9416 # Header 9417 vcf_reader = self.get_header() 9418 9419 # Create variant id 9420 variant_id_column = self.get_variant_id_column() 9421 added_columns = [variant_id_column] 9422 9423 # variant_id, FORMAT and samples 9424 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9425 ped_samples 9426 ) 9427 9428 # Create dataframe 9429 dataframe_barcode = self.get_query_to_df( 9430 f""" SELECT {samples_fields} FROM {table_variants} """ 9431 ) 9432 9433 # Create barcode column 9434 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9435 lambda row: barcode(row, samples=ped_samples), axis=1 9436 ) 9437 9438 # Add barcode family to header 9439 # Add vaf_normalization to header 9440 vcf_reader.formats[tag] = 
vcf.parser._Format( 9441 id=tag, 9442 num=".", 9443 type="String", 9444 desc=vcf_infos_tags.get(tag, "barcode family calculation"), 9445 type_code=self.code_type_map.get("String"), 9446 ) 9447 vcf_reader.formats[f"{tag}S"] = vcf.parser._Format( 9448 id=f"{tag}S", 9449 num=".", 9450 type="String", 9451 desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"), 9452 type_code=self.code_type_map.get("String"), 9453 ) 9454 9455 # Update 9456 # for sample in ped_samples: 9457 sql_update_set = [] 9458 for sample in self.get_header_sample_list() + ["FORMAT"]: 9459 if sample in ped_samples: 9460 value = f'dataframe_barcode."{barcode_infos}"' 9461 value_samples = "'" + ",".join(ped_samples) + "'" 9462 elif sample == "FORMAT": 9463 value = f"'{tag}'" 9464 value_samples = f"'{tag}S'" 9465 else: 9466 value = "'.'" 9467 value_samples = "'.'" 9468 format_regex = r"[a-zA-Z0-9\s]" 9469 sql_update_set.append( 9470 f""" 9471 "{sample}" = 9472 concat( 9473 CASE 9474 WHEN {table_variants}."{sample}" = './.' 9475 THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g')) 9476 ELSE {table_variants}."{sample}" 9477 END, 9478 ':', 9479 {value}, 9480 ':', 9481 {value_samples} 9482 ) 9483 """ 9484 ) 9485 9486 sql_update_set_join = ", ".join(sql_update_set) 9487 sql_update = f""" 9488 UPDATE {table_variants} 9489 SET {sql_update_set_join} 9490 FROM dataframe_barcode 9491 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9492 """ 9493 self.conn.execute(sql_update) 9494 9495 # Remove added columns 9496 for added_column in added_columns: 9497 self.drop_column(column=added_column) 9498 9499 # Delete dataframe 9500 del dataframe_barcode 9501 gc.collect() 9502 9503 def calculation_trio(self) -> None: 9504 """ 9505 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9506 information to the INFO field of each variant. 
9507 """ 9508 9509 # if FORMAT and samples 9510 if ( 9511 "FORMAT" in self.get_header_columns_as_list() 9512 and self.get_header_sample_list() 9513 ): 9514 9515 # trio annotation field 9516 trio_tag = "trio" 9517 9518 # VCF infos tags 9519 vcf_infos_tags = { 9520 "trio": "trio calculation", 9521 } 9522 9523 # Param 9524 param = self.get_param() 9525 9526 # Prefix 9527 prefix = self.get_explode_infos_prefix() 9528 9529 # Trio param 9530 trio_ped = ( 9531 param.get("calculation", {}) 9532 .get("calculations", {}) 9533 .get("TRIO", {}) 9534 .get("trio_pedigree", None) 9535 ) 9536 9537 # Load trio 9538 if trio_ped: 9539 9540 # Trio pedigree is a file 9541 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9542 log.debug("TRIO pedigree is file") 9543 with open(full_path(trio_ped)) as trio_ped: 9544 trio_ped = json.load(trio_ped) 9545 9546 # Trio pedigree is a string 9547 elif isinstance(trio_ped, str): 9548 log.debug("TRIO pedigree is str") 9549 try: 9550 trio_ped = json.loads(trio_ped) 9551 log.debug("TRIO pedigree is json str") 9552 except ValueError as e: 9553 trio_samples = trio_ped.split(",") 9554 if len(trio_samples) == 3: 9555 trio_ped = { 9556 "father": trio_samples[0], 9557 "mother": trio_samples[1], 9558 "child": trio_samples[2], 9559 } 9560 log.debug("TRIO pedigree is list str") 9561 else: 9562 msg_error = "TRIO pedigree not well formatted" 9563 log.error(msg_error) 9564 raise ValueError(msg_error) 9565 9566 # Trio pedigree is a dict 9567 elif isinstance(trio_ped, dict): 9568 log.debug("TRIO pedigree is dict") 9569 9570 # Trio pedigree is not well formatted 9571 else: 9572 msg_error = "TRIO pedigree not well formatted" 9573 log.error(msg_error) 9574 raise ValueError(msg_error) 9575 9576 # Construct trio list 9577 trio_samples = [ 9578 trio_ped.get("father", ""), 9579 trio_ped.get("mother", ""), 9580 trio_ped.get("child", ""), 9581 ] 9582 9583 else: 9584 log.debug("TRIO pedigree not defined. 
Take the first 3 samples") 9585 samples_list = self.get_header_sample_list() 9586 if len(samples_list) >= 3: 9587 trio_samples = self.get_header_sample_list()[0:3] 9588 trio_ped = { 9589 "father": trio_samples[0], 9590 "mother": trio_samples[1], 9591 "child": trio_samples[2], 9592 } 9593 else: 9594 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9595 log.error(msg_error) 9596 raise ValueError(msg_error) 9597 9598 # Check trio pedigree 9599 if not trio_ped or len(trio_ped) != 3: 9600 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9601 log.error(msg_error) 9602 raise ValueError(msg_error) 9603 9604 # Log 9605 log.info( 9606 f"Calculation 'TRIO' - Samples: " 9607 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9608 ) 9609 9610 # Field 9611 trio_infos = prefix + trio_tag 9612 9613 # Variants table 9614 table_variants = self.get_table_variants() 9615 9616 # Header 9617 vcf_reader = self.get_header() 9618 9619 # Create variant id 9620 variant_id_column = self.get_variant_id_column() 9621 added_columns = [variant_id_column] 9622 9623 # variant_id, FORMAT and samples 9624 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9625 self.get_header_sample_list() 9626 ) 9627 9628 # Create dataframe 9629 dataframe_trio = self.get_query_to_df( 9630 f""" SELECT {samples_fields} FROM {table_variants} """ 9631 ) 9632 9633 # Create trio column 9634 dataframe_trio[trio_infos] = dataframe_trio.apply( 9635 lambda row: trio(row, samples=trio_samples), axis=1 9636 ) 9637 9638 # Add trio to header 9639 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9640 trio_tag, 9641 ".", 9642 "String", 9643 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9644 "howard calculation", 9645 "0", 9646 self.code_type_map.get("String"), 9647 ) 9648 9649 # Update 9650 sql_update = f""" 9651 UPDATE {table_variants} 9652 SET "INFO" = 9653 concat( 9654 CASE 9655 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9656 THEN '' 9657 ELSE 
concat("INFO", ';') 9658 END, 9659 CASE 9660 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9661 AND dataframe_trio."{trio_infos}" NOT NULL 9662 THEN concat( 9663 '{trio_tag}=', 9664 dataframe_trio."{trio_infos}" 9665 ) 9666 ELSE '' 9667 END 9668 ) 9669 FROM dataframe_trio 9670 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9671 """ 9672 self.conn.execute(sql_update) 9673 9674 # Remove added columns 9675 for added_column in added_columns: 9676 self.drop_column(column=added_column) 9677 9678 # Delete dataframe 9679 del dataframe_trio 9680 gc.collect() 9681 9682 def calculation_vaf_normalization(self) -> None: 9683 """ 9684 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9685 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9686 :return: The function does not return anything. 9687 """ 9688 9689 # if FORMAT and samples 9690 if ( 9691 "FORMAT" in self.get_header_columns_as_list() 9692 and self.get_header_sample_list() 9693 ): 9694 9695 # vaf_normalization annotation field 9696 vaf_normalization_tag = "VAF" 9697 9698 # VCF infos tags 9699 vcf_infos_tags = { 9700 "VAF": "VAF Variant Frequency", 9701 } 9702 9703 # Prefix 9704 prefix = self.get_explode_infos_prefix() 9705 9706 # Variants table 9707 table_variants = self.get_table_variants() 9708 9709 # Header 9710 vcf_reader = self.get_header() 9711 9712 # Do not calculate if VAF already exists 9713 if "VAF" in vcf_reader.formats: 9714 log.debug("VAF already on genotypes") 9715 return 9716 9717 # Create variant id 9718 variant_id_column = self.get_variant_id_column() 9719 added_columns = [variant_id_column] 9720 9721 # variant_id, FORMAT and samples 9722 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9723 f""" "{sample}" """ for sample in self.get_header_sample_list() 9724 ) 9725 9726 # Create dataframe 9727 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} 
FROM {table_variants} """ 9728 log.debug(f"query={query}") 9729 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9730 9731 vaf_normalization_set = [] 9732 9733 # for each sample vaf_normalization 9734 for sample in self.get_header_sample_list(): 9735 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9736 lambda row: vaf_normalization(row, sample=sample), axis=1 9737 ) 9738 vaf_normalization_set.append( 9739 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9740 ) 9741 9742 # Add VAF to FORMAT 9743 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9744 "FORMAT" 9745 ].apply(lambda x: str(x) + ":VAF") 9746 vaf_normalization_set.append( 9747 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9748 ) 9749 9750 # Add vaf_normalization to header 9751 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9752 id=vaf_normalization_tag, 9753 num="1", 9754 type="Float", 9755 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9756 type_code=self.code_type_map.get("Float"), 9757 ) 9758 9759 # Create fields to add in INFO 9760 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9761 9762 # Update 9763 sql_update = f""" 9764 UPDATE {table_variants} 9765 SET {sql_vaf_normalization_set} 9766 FROM dataframe_vaf_normalization 9767 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9768 9769 """ 9770 self.conn.execute(sql_update) 9771 9772 # Remove added columns 9773 for added_column in added_columns: 9774 self.drop_column(column=added_column) 9775 9776 # Delete dataframe 9777 del dataframe_vaf_normalization 9778 gc.collect() 9779 9780 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9781 """ 9782 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9783 field in a VCF file and updates the INFO column of the variants table with the calculated 9784 statistics. 
9785 9786 :param info: The `info` parameter is a string that represents the type of information for which 9787 genotype statistics are calculated. It is used to generate various VCF info tags for the 9788 statistics, such as the number of occurrences, the list of values, the minimum value, the 9789 maximum value, the mean, the median, defaults to VAF 9790 :type info: str (optional) 9791 """ 9792 9793 # if FORMAT and samples 9794 if ( 9795 "FORMAT" in self.get_header_columns_as_list() 9796 and self.get_header_sample_list() 9797 ): 9798 9799 # vaf_stats annotation field 9800 vaf_stats_tag = info + "_stats" 9801 9802 # VCF infos tags 9803 vcf_infos_tags = { 9804 info + "_stats_nb": f"genotype {info} Statistics - number of {info}", 9805 info + "_stats_list": f"genotype {info} Statistics - list of {info}", 9806 info + "_stats_min": f"genotype {info} Statistics - min {info}", 9807 info + "_stats_max": f"genotype {info} Statistics - max {info}", 9808 info + "_stats_mean": f"genotype {info} Statistics - mean {info}", 9809 info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}", 9810 info 9811 + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}", 9812 } 9813 9814 # Prefix 9815 prefix = self.get_explode_infos_prefix() 9816 9817 # Field 9818 vaf_stats_infos = prefix + vaf_stats_tag 9819 9820 # Variants table 9821 table_variants = self.get_table_variants() 9822 9823 # Header 9824 vcf_reader = self.get_header() 9825 9826 # Create variant id 9827 variant_id_column = self.get_variant_id_column() 9828 added_columns = [variant_id_column] 9829 9830 # variant_id, FORMAT and samples 9831 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9832 self.get_header_sample_list() 9833 ) 9834 9835 # Create dataframe 9836 dataframe_vaf_stats = self.get_query_to_df( 9837 f""" SELECT {samples_fields} FROM {table_variants} """ 9838 ) 9839 9840 # Create vaf_stats column 9841 dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply( 9842 
lambda row: genotype_stats( 9843 row, samples=self.get_header_sample_list(), info=info 9844 ), 9845 axis=1, 9846 ) 9847 9848 # List of vcf tags 9849 sql_vaf_stats_fields = [] 9850 9851 # Check all VAF stats infos 9852 for stat in vcf_infos_tags: 9853 9854 # Extract stats 9855 dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply( 9856 lambda x: dict(x).get(stat, "") 9857 ) 9858 9859 # Add snpeff_hgvs to header 9860 vcf_reader.infos[stat] = vcf.parser._Info( 9861 stat, 9862 ".", 9863 "String", 9864 vcf_infos_tags.get(stat, "genotype statistics"), 9865 "howard calculation", 9866 "0", 9867 self.code_type_map.get("String"), 9868 ) 9869 9870 if len(sql_vaf_stats_fields): 9871 sep = ";" 9872 else: 9873 sep = "" 9874 9875 # Create fields to add in INFO 9876 sql_vaf_stats_fields.append( 9877 f""" 9878 CASE 9879 WHEN dataframe_vaf_stats."{stat}" NOT NULL 9880 THEN concat( 9881 '{sep}{stat}=', 9882 dataframe_vaf_stats."{stat}" 9883 ) 9884 ELSE '' 9885 END 9886 """ 9887 ) 9888 9889 # SQL set for update 9890 sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields) 9891 9892 # Update 9893 sql_update = f""" 9894 UPDATE {table_variants} 9895 SET "INFO" = 9896 concat( 9897 CASE 9898 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9899 THEN '' 9900 ELSE concat("INFO", ';') 9901 END, 9902 {sql_vaf_stats_fields_set} 9903 ) 9904 FROM dataframe_vaf_stats 9905 WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}" 9906 9907 """ 9908 self.conn.execute(sql_update) 9909 9910 # Remove added columns 9911 for added_column in added_columns: 9912 self.drop_column(column=added_column) 9913 9914 # Delete dataframe 9915 del dataframe_vaf_stats 9916 gc.collect() 9917 9918 def calculation_transcripts_annotation( 9919 self, info_json: str = None, info_format: str = None 9920 ) -> None: 9921 """ 9922 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9923 field to it if transcripts are available. 
9924 9925 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9926 is a string parameter that represents the information field to be used in the transcripts JSON. 9927 It is used to specify the JSON format for the transcripts information. If no value is provided 9928 when calling the method, it defaults to " 9929 :type info_json: str 9930 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9931 method is a string parameter that specifies the format of the information field to be used in 9932 the transcripts JSON. It is used to define the format of the information field 9933 :type info_format: str 9934 """ 9935 9936 # Create transcripts table 9937 transcripts_table = self.create_transcript_view() 9938 9939 # Add info field 9940 if transcripts_table: 9941 self.transcript_view_to_variants( 9942 transcripts_table=transcripts_table, 9943 transcripts_info_field_json=info_json, 9944 transcripts_info_field_format=info_format, 9945 ) 9946 else: 9947 log.info("No Transcripts to process. Check param.json file configuration") 9948 9949 def calculation_transcripts_prioritization(self) -> None: 9950 """ 9951 The function `calculation_transcripts_prioritization` creates a transcripts table and 9952 prioritizes transcripts based on certain criteria. 9953 """ 9954 9955 # Create transcripts table 9956 transcripts_table = self.create_transcript_view() 9957 9958 # Add info field 9959 if transcripts_table: 9960 self.transcripts_prioritization(transcripts_table=transcripts_table) 9961 else: 9962 log.info("No Transcripts to process. Check param.json file configuration") 9963 9964 def calculation_transcripts_export(self) -> None: 9965 """ """ 9966 9967 # Create transcripts table 9968 transcripts_table = self.create_transcript_view() 9969 9970 # Add info field 9971 if transcripts_table: 9972 self.transcripts_export(transcripts_table=transcripts_table) 9973 else: 9974 log.info("No Transcripts to process. 
Check param.json file configuration") 9975 9976 ############### 9977 # Transcripts # 9978 ############### 9979 9980 def transcripts_export( 9981 self, transcripts_table: str = None, param: dict = {} 9982 ) -> bool: 9983 """ """ 9984 9985 log.debug("Start transcripts export...") 9986 9987 # Param 9988 if not param: 9989 param = self.get_param() 9990 9991 # Param export 9992 param_transcript_export = param.get("transcripts", {}).get("export", {}) 9993 9994 # Output file 9995 transcripts_export_output = param_transcript_export.get("output", None) 9996 9997 if not param_transcript_export or not transcripts_export_output: 9998 log.warning(f"No transcriipts export parameters defined!") 9999 return False 10000 10001 # List of transcripts annotations 10002 query_describe = f""" 10003 SELECT column_name 10004 FROM ( 10005 DESCRIBE SELECT * FROM {transcripts_table} 10006 ) 10007 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10008 """ 10009 transcripts_annotations_list = list( 10010 self.get_query_to_df(query=query_describe)["column_name"] 10011 ) 10012 10013 # Create transcripts table for export 10014 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10015 random.choices(string.ascii_uppercase + string.digits, k=10) 10016 ) 10017 query_create_transcripts_table_export = f""" 10018 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10019 """ 10020 self.execute_query(query=query_create_transcripts_table_export) 10021 10022 # Output file format 10023 transcripts_export_output_format = get_file_format( 10024 filename=transcripts_export_output 10025 ) 10026 10027 # Format VCF - construct INFO 10028 if transcripts_export_output_format in ["vcf"]: 10029 10030 # Construct query update INFO and header 10031 query_update_info = [] 10032 for field in transcripts_annotations_list: 10033 10034 # If field not in header 10035 if field not in 
self.get_header_infos_list(): 10036 10037 # Add PZ Transcript in header 10038 self.get_header().infos[field] = vcf.parser._Info( 10039 field, 10040 ".", 10041 "String", 10042 f"Annotation '{field}' from transcript view", 10043 "unknown", 10044 "unknown", 10045 0, 10046 ) 10047 10048 # Add field as INFO/tag 10049 query_update_info.append( 10050 f""" 10051 CASE 10052 WHEN "{field}" IS NOT NULL 10053 THEN concat('{field}=', "{field}", ';') 10054 ELSE '' 10055 END 10056 """ 10057 ) 10058 10059 # Query param 10060 query_update_info_value = ( 10061 f""" concat('', {", ".join(query_update_info)}) """ 10062 ) 10063 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10064 10065 else: 10066 10067 # Query param 10068 query_update_info_value = f""" NULL """ 10069 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10070 10071 # Update query INFO column 10072 query_update = f""" 10073 UPDATE {transcripts_table_export} 10074 SET INFO = {query_update_info_value} 10075 10076 """ 10077 self.execute_query(query=query_update) 10078 10079 # Export 10080 self.export_output( 10081 output_file=transcripts_export_output, 10082 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10083 ) 10084 10085 # Drop transcripts export table 10086 query_drop_transcripts_table_export = f""" 10087 DROP TABLE {transcripts_table_export} 10088 """ 10089 self.execute_query(query=query_drop_transcripts_table_export) 10090 10091 def transcripts_prioritization( 10092 self, transcripts_table: str = None, param: dict = {} 10093 ) -> bool: 10094 """ 10095 The `transcripts_prioritization` function prioritizes transcripts based on certain parameters 10096 and updates the variants table with the prioritized information. 10097 10098 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10099 of the table containing transcripts data. 
If no value is provided, it defaults to "transcripts". 10100 This parameter is used to identify the table where the transcripts data is stored for the 10101 prioritization process 10102 :type transcripts_table: str 10103 :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary 10104 that contains various configuration settings for the prioritization process of transcripts. It 10105 is used to customize the behavior of the prioritization algorithm and includes settings such as 10106 the prefix for prioritization fields, default profiles, and other 10107 :type param: dict 10108 :return: The function `transcripts_prioritization` returns a boolean value `True` if the 10109 transcripts prioritization process is successfully completed, and `False` if there are any 10110 issues or if no profile is defined for transcripts prioritization. 10111 """ 10112 10113 log.debug("Start transcripts prioritization...") 10114 10115 # Param 10116 if not param: 10117 param = self.get_param() 10118 10119 # Variants table 10120 table_variants = self.get_table_variants() 10121 10122 # Transcripts table 10123 if transcripts_table is None: 10124 transcripts_table = self.create_transcript_view( 10125 transcripts_table="transcripts", param=param 10126 ) 10127 if transcripts_table is None: 10128 msg_err = "No Transcripts table availalble" 10129 log.error(msg_err) 10130 raise ValueError(msg_err) 10131 log.debug(f"transcripts_table={transcripts_table}") 10132 10133 # Get transcripts columns 10134 columns_as_list_query = f""" 10135 DESCRIBE {transcripts_table} 10136 """ 10137 columns_as_list = list( 10138 self.get_query_to_df(columns_as_list_query)["column_name"] 10139 ) 10140 10141 # Create INFO if not exists 10142 if "INFO" not in columns_as_list: 10143 query_add_info = f""" 10144 ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT ''; 10145 """ 10146 self.execute_query(query_add_info) 10147 10148 # Prioritization param and Force only PZ Score and 
Flag 10149 pz_param = param.get("transcripts", {}).get("prioritization", {}) 10150 10151 # PZ profile by default 10152 pz_profile_default = ( 10153 param.get("transcripts", {}).get("prioritization", {}).get("profiles", None) 10154 ) 10155 10156 # Exit if no profile 10157 if pz_profile_default is None: 10158 log.warning("No profile defined for transcripts prioritization") 10159 return False 10160 10161 # PZ fields 10162 pz_param_pzfields = {} 10163 10164 # PZ field transcripts 10165 pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript" 10166 10167 # Add PZ Transcript in header 10168 self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info( 10169 pz_fields_transcripts, 10170 ".", 10171 "String", 10172 f"Transcript selected from prioritization process, profile {pz_profile_default}", 10173 "unknown", 10174 "unknown", 10175 code_type_map["String"], 10176 ) 10177 10178 # Mandatory fields 10179 pz_mandatory_fields_list = [ 10180 "Score", 10181 "Flag", 10182 "Tags", 10183 "Comment", 10184 "Infos", 10185 "Class", 10186 ] 10187 pz_mandatory_fields = [] 10188 for pz_mandatory_field in pz_mandatory_fields_list: 10189 pz_mandatory_fields.append( 10190 pz_param.get("pzprefix", "PTZ") + pz_mandatory_field 10191 ) 10192 10193 # PZ fields in param 10194 for pz_field in pz_param.get("pzfields", []): 10195 if pz_field in pz_mandatory_fields_list: 10196 pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = ( 10197 pz_param.get("pzprefix", "PTZ") + pz_field 10198 ) 10199 else: 10200 pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field 10201 pz_param_pzfields[pz_field] = pz_field_new 10202 10203 # Add PZ Transcript in header 10204 self.get_header().infos[pz_field_new] = vcf.parser._Info( 10205 pz_field_new, 10206 ".", 10207 "String", 10208 f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}", 10209 "unknown", 10210 "unknown", 10211 code_type_map["String"], 10212 ) 10213 10214 # PZ fields 
param 10215 pz_param["pzfields"] = pz_mandatory_fields 10216 10217 # Prioritization 10218 prioritization_result = self.prioritization( 10219 table=transcripts_table, 10220 pz_param=param.get("transcripts", {}).get("prioritization", {}), 10221 ) 10222 if not prioritization_result: 10223 log.warning("Transcripts prioritization not processed") 10224 return False 10225 10226 # PZ fields sql query 10227 query_update_select_list = [] 10228 query_update_concat_list = [] 10229 query_update_order_list = [] 10230 for pz_param_pzfield in set( 10231 list(pz_param_pzfields.keys()) + pz_mandatory_fields 10232 ): 10233 query_update_select_list.append(f" {pz_param_pzfield}, ") 10234 10235 for pz_param_pzfield in pz_param_pzfields: 10236 query_update_concat_list.append( 10237 f""" 10238 , CASE 10239 WHEN {pz_param_pzfield} IS NOT NULL 10240 THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield}) 10241 ELSE '' 10242 END 10243 """ 10244 ) 10245 10246 # Order by 10247 pz_orders = ( 10248 param.get("transcripts", {}) 10249 .get("prioritization", {}) 10250 .get("prioritization_transcripts_order", {}) 10251 ) 10252 if not pz_orders: 10253 pz_orders = { 10254 pz_param.get("pzprefix", "PTZ") + "Flag": "ASC", 10255 pz_param.get("pzprefix", "PTZ") + "Score": "DESC", 10256 } 10257 for pz_order in pz_orders: 10258 query_update_order_list.append( 10259 f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """ 10260 ) 10261 10262 # Fields to explode 10263 fields_to_explode = ( 10264 list(pz_param_pzfields.keys()) 10265 + pz_mandatory_fields 10266 + list(pz_orders.keys()) 10267 ) 10268 # Remove transcript column as a specific transcript column 10269 if "transcript" in fields_to_explode: 10270 fields_to_explode.remove("transcript") 10271 10272 # Fields intranscripts table 10273 query_transcripts_table = f""" 10274 DESCRIBE SELECT * FROM {transcripts_table} 10275 """ 10276 query_transcripts_table = self.get_query_to_df(query=query_transcripts_table) 10277 10278 # Check fields to 
explode 10279 for field_to_explode in fields_to_explode: 10280 if field_to_explode not in self.get_header_infos_list() + list( 10281 query_transcripts_table.column_name 10282 ): 10283 msg_err = f"INFO/{field_to_explode} NOT IN header" 10284 log.error(msg_err) 10285 raise ValueError(msg_err) 10286 10287 # Explode fields to explode 10288 self.explode_infos( 10289 table=transcripts_table, 10290 fields=fields_to_explode, 10291 ) 10292 10293 # Transcript preference file 10294 transcripts_preference_file = ( 10295 param.get("transcripts", {}) 10296 .get("prioritization", {}) 10297 .get("prioritization_transcripts", {}) 10298 ) 10299 transcripts_preference_file = full_path(transcripts_preference_file) 10300 10301 # Transcript preference forced 10302 transcript_preference_force = ( 10303 param.get("transcripts", {}) 10304 .get("prioritization", {}) 10305 .get("prioritization_transcripts_force", False) 10306 ) 10307 # Transcript version forced 10308 transcript_version_force = ( 10309 param.get("transcripts", {}) 10310 .get("prioritization", {}) 10311 .get("prioritization_transcripts_version_force", False) 10312 ) 10313 10314 # Transcripts Ranking 10315 if transcripts_preference_file: 10316 10317 # Transcripts file to dataframe 10318 if os.path.exists(transcripts_preference_file): 10319 transcripts_preference_dataframe = transcripts_file_to_df( 10320 transcripts_preference_file 10321 ) 10322 else: 10323 log.error( 10324 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10325 ) 10326 raise ValueError( 10327 f"Transcript file '{transcripts_preference_file}' does NOT exist" 10328 ) 10329 10330 # Order by depending to transcript preference forcing 10331 if transcript_preference_force: 10332 order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """ 10333 else: 10334 order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """ 10335 10336 # Transcript 
columns joined depend on version consideration 10337 if transcript_version_force: 10338 transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """ 10339 else: 10340 transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """ 10341 10342 # Query ranking for update 10343 query_update_ranking = f""" 10344 SELECT 10345 "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)} 10346 ROW_NUMBER() OVER ( 10347 PARTITION BY "#CHROM", POS, REF, ALT 10348 ORDER BY {order_by} 10349 ) AS rn 10350 FROM {transcripts_table} 10351 LEFT JOIN 10352 ( 10353 SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order 10354 FROM transcripts_preference_dataframe 10355 ) AS transcripts_preference 10356 ON {transcripts_version_join} 10357 """ 10358 10359 else: 10360 10361 # Query ranking for update 10362 query_update_ranking = f""" 10363 SELECT 10364 "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)} 10365 ROW_NUMBER() OVER ( 10366 PARTITION BY "#CHROM", POS, REF, ALT 10367 ORDER BY {" , ".join(query_update_order_list)} 10368 ) AS rn 10369 FROM {transcripts_table} 10370 """ 10371 10372 # Export Transcripts prioritization infos to variants table 10373 query_update = f""" 10374 WITH RankedTranscripts AS ( 10375 {query_update_ranking} 10376 ) 10377 UPDATE {table_variants} 10378 SET 10379 INFO = CONCAT(CASE 10380 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 10381 THEN '' 10382 ELSE concat("INFO", ';') 10383 END, 10384 concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)}) 10385 ) 10386 FROM 10387 RankedTranscripts 10388 WHERE 10389 rn = 1 10390 AND variants."#CHROM" = RankedTranscripts."#CHROM" 10391 AND variants."POS" = RankedTranscripts."POS" 10392 AND variants."REF" = RankedTranscripts."REF" 10393 AND variants."ALT" = 
RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True

    def create_transcript_view_from_columns_map(
        self,
        transcripts_table: str = "transcripts",
        columns_maps: dict = {},
        added_columns: list = [],
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list, list]:
        """
        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
        specified columns mapping for transcripts data.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table where the transcripts data is stored or will be stored in the database. This table
        typically contains information about transcripts such as Ensembl transcript IDs, gene names,
        scores, predictions, etc. Defaults to "transcripts"
        :type transcripts_table: str (optional)
        :param columns_maps: The `columns_maps` parameter contains information
        about how to map columns from a transcripts table to create a view. Each entry in the
        `columns_maps` list represents a mapping configuration for a specific set of columns. It
        typically includes details such as the main transcript column and additional information columns
        :type columns_maps: dict
        :param added_columns: The `added_columns` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the additional columns
        that will be added to the view being created based on the columns map provided. These columns
        are generated by exploding the transcript information columns along with the main transcript
        column
        :type added_columns: list
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
        tables created during the process of creating a transcript view from a columns map. These
        temporary tables are used to store intermediate results or transformations before the final view
        is generated
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_columns_map` function is a list that stores the fields that are
        used for annotation in the query view creation process. These fields are extracted from the
        `transcripts_column` and `transcripts_infos_columns` specified in the `columns_maps`
        :type annotation_fields: list
        :param column_rename: The `column_rename` parameter in the
        `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify
        custom renaming for columns during the creation of the temporary table view. This parameter
        provides a mapping of original column names to the desired renamed column names
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the
        `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the
        column names should be cleaned or not. If set to `True`, the column names will be cleaned by
        removing any non-alphanumeric characters from them, defaults to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map`
        function is used to specify the case transformation to be applied to the columns during the view
        creation process. It allows you to control whether the column names should be converted to
        lowercase, uppercase, or remain unchanged
        :type column_case: str
        :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three
        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
        """

        log.debug("Start transcrpts view creation from columns map...")

        # Example of a "from_columns_map" configuration handled by this method:
        # "from_columns_map": [
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "Ensembl_geneid",
        #             "LIST_S2_score",
        #             "LIST_S2_pred",
        #         ],
        #     },
        #     {
        #         "transcripts_column": "Ensembl_transcriptid",
        #         "transcripts_infos_columns": [
        #             "genename",
        #             "VARITY_R_score",
        #             "Aloft_pred",
        #         ],
        #     },
        # ],

        # Init (None defaults avoid shared mutable default arguments)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        # Variants table
        table_variants = self.get_table_variants()

        for columns_map in columns_maps:

            # Transcript column
            transcripts_column = columns_map.get("transcripts_column", None)

            # Transcripts infos columns
            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])

            # NOTE(review): the three overrides below rebind the method-level values,
            # so a rename/clean/case setting from one columns_map carries over to any
            # later map that omits the key — confirm this is intended.

            # Transcripts infos columns rename
            column_rename = columns_map.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = columns_map.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = columns_map.get("column_case", column_case)

            if transcripts_column is not None:

                # Explode INFO annotations into columns for the transcript column
                # and its infos columns; the created columns are tracked so the
                # caller can drop them afterwards
                added_columns += self.explode_infos(
                    fields=[transcripts_column] + transcripts_infos_columns
                )

                # View clauses
                clause_select_variants = []
                clause_select_tanscripts = []
                for field in [transcripts_column] + transcripts_infos_columns:

                    # AS field
                    as_field = field

                    # Rename
                    if column_rename:
                        as_field = column_rename.get(as_field, as_field)

                    # Clean
                    if column_clean:
                        as_field = clean_annotation_field(as_field)

                    # Case
                    if column_case:
                        if column_case.lower() in ["lower"]:
                            as_field = as_field.lower()
                        elif column_case.lower() in ["upper"]:
                            as_field = as_field.upper()

                    # Clause select Variants: split comma-separated values into
                    # one row per transcript
                    clause_select_variants.append(
                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                    )

                    # The main transcript column keeps its original name; info
                    # columns use the (possibly renamed/cleaned/cased) alias and
                    # are registered as annotation fields
                    if field in [transcripts_column]:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{field}' """
                        )
                    else:
                        clause_select_tanscripts.append(
                            f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """
                        )
                        annotation_fields.append(as_field)

                # Query view: one row per transcript, keyed by variant columns
                query = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        "{transcripts_column}" AS 'transcript',
                        {", ".join(clause_select_tanscripts)}
                    FROM (
                        SELECT
                            "#CHROM", POS, REF, ALT, INFO,
                            {", ".join(clause_select_variants)}
                        FROM {table_variants}
                    )
                    WHERE "{transcripts_column}" IS NOT NULL
                """

                # Create temporary table name: base name plus a random 10-char
                # suffix to avoid collisions between per-map tables
                temporary_table = transcripts_table + "".join(
                    random.choices(string.ascii_uppercase + string.digits, k=10)
                )

                # Temporary_tables
                temporary_tables.append(temporary_table)
                query_view = f"""
                    CREATE TEMPORARY TABLE {temporary_table}
                    AS ({query})
                """
                self.execute_query(query=query_view)

        return added_columns, temporary_tables, annotation_fields

    def create_transcript_view_from_column_format(
        self,
        transcripts_table: str = "transcripts",
        column_formats: dict = {},
        temporary_tables: list = None,
        annotation_fields: list = None,
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> tuple[list, list]:
        """
        The `create_transcript_view_from_column_format` function generates a transcript view based on
        specified column formats, adds additional columns and annotation fields, and returns the list of
        temporary tables and annotation fields.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing the transcripts data. This table will be used as the base table for
        creating the transcript view. The default value for this parameter is "transcripts", but you can
        provide a different table name if needed, defaults to transcripts
        :type transcripts_table: str (optional)
        :param column_formats: The `column_formats` parameter is a dictionary that contains information
        about the columns to be used for creating the transcript view. Each entry in the dictionary
        specifies the mapping between a transcripts column and a transcripts infos column. This
        parameter allows you to define how the columns from the transcripts table should be transformed
        or mapped
        :type column_formats: dict
        :param temporary_tables: The `temporary_tables` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the names of
        temporary views created during the process of creating a transcript view from a column format.
        These temporary views are used to manipulate and extract data before generating the final
        transcript view
        :type temporary_tables: list
        :param annotation_fields: The `annotation_fields` parameter in the
        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
        that are extracted from the temporary views created during the process. These annotation fields
        are obtained by querying the temporary views and extracting the column names excluding specific
        columns like `#CHROM`
        :type annotation_fields: list
        :param column_rename: The `column_rename` parameter in the
        `create_transcript_view_from_column_format` function is a dictionary that allows you to specify
        custom renaming of columns in the transcripts infos table. By providing a mapping of original
        column names to new column names in this dictionary, you can rename specific columns during the
        process
        :type column_rename: dict
        :param column_clean: The `column_clean` parameter in the
        `create_transcript_view_from_column_format` function is a boolean flag that determines whether
        the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns
        will be cleaned during the creation of the transcript view based on the specified column format,
        defaults to False
        :type column_clean: bool (optional)
        :param column_case: The `column_case` parameter in the
        `create_transcript_view_from_column_format` function is used to specify the case transformation
        to be applied to the columns in the transcript view. It can be set to either "upper" or "lower"
        to convert the column names to uppercase or lowercase, respectively
        :type column_case: str
        :return: The `create_transcript_view_from_column_format` function returns two lists:
        `temporary_tables` and `annotation_fields`.
        """

        log.debug("Start transcrpts view creation from column format...")

        # Example of a "from_column_format" configuration handled by this method:
        # "from_column_format": [
        #     {
        #         "transcripts_column": "ANN",
        #         "transcripts_infos_column": "Feature_ID",
        #     }
        # ],

        # Init (None defaults avoid shared mutable default arguments)
        if temporary_tables is None:
            temporary_tables = []
        if annotation_fields is None:
            annotation_fields = []

        for column_format in column_formats:

            # annotation field and transcript annotation field
            annotation_field = column_format.get("transcripts_column", "ANN")
            transcript_annotation = column_format.get(
                "transcripts_infos_column", "Feature_ID"
            )

            # NOTE(review): as in create_transcript_view_from_columns_map, the
            # rename/clean/case overrides below persist across loop iterations —
            # confirm this is intended.

            # Transcripts infos columns rename
            column_rename = column_format.get("column_rename", column_rename)

            # Transcripts infos columns clean
            column_clean = column_format.get("column_clean", column_clean)

            # Transcripts infos columns case
            column_case = column_format.get("column_case", column_case)

            # Temporary View name: base name plus random 10-char suffix to avoid collisions
            temporary_view_name = transcripts_table + "".join(
                random.choices(string.ascii_uppercase + string.digits, k=10)
            )

            # Create temporary view; annotation_format_to_table returns None when
            # the annotation field is not present in the VCF header
            temporary_view_name = self.annotation_format_to_table(
                uniquify=True,
                annotation_field=annotation_field,
                view_name=temporary_view_name,
                annotation_id=transcript_annotation,
                column_rename=column_rename,
                column_clean=column_clean,
                column_case=column_case,
            )

            # Annotation fields
            if temporary_view_name:
                # Collect the view's column names, excluding the variant key columns
                query_annotation_fields = f"""
                    SELECT *
                    FROM (
                        DESCRIBE SELECT *
                        FROM {temporary_view_name}
                    )
                    WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
                """
                df_annotation_fields = self.get_query_to_df(
                    query=query_annotation_fields
                )

                # Add temporary view and annotation fields
                temporary_tables.append(temporary_view_name)
                annotation_fields += list(set(df_annotation_fields["column_name"]))

        return temporary_tables, annotation_fields

    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        The `create_transcript_view` function generates a transcript view by processing data from a
        specified table based on provided parameters and structural information.

        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
        is used to specify the name of the table that will store the final transcript view data. If a table
        name is not provided, the function will create a new table to store the transcript view data,
        defaults to the "transcripts" table configured in the parameters
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
        `create_transcript_view` function is a boolean parameter that determines whether to drop the
        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
        the function will drop the existing transcripts table if it exists, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
        contains information needed to create a transcript view. It includes details such as the structure
        of the transcripts, columns mapping, column formats, and other necessary information for generating
        the view. This parameter allows for flexibility and customization
        :type param: dict
        :return: The `create_transcript_view` function returns the name of the transcripts table that was
        created or modified during the execution of the function, or None when no transcripts
        "struct" section is configured in the parameters.
        """

        log.debug("Start transcripts view creation...")

        # Default
        transcripts_table_default = "transcripts"

        # Param
        if not param:
            param = self.get_param()

        # Struct
        struct = param.get("transcripts", {}).get("struct", None)

        # Transcript version: optionally strip the '.N' version suffix from transcript ids
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Transcripts mapping file (transcript/alias pairs)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # Transcripts mapping force: keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # added_columns
            added_columns = []

            # Temporary tables
            temporary_tables = []

            # Annotation fields
            annotation_fields = []

            # from columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # from column format
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Remove some specific fields/column (variant keys and the transcript column itself)
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge temporary tables query
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # other temporary table (using UNION BY NAME to align heterogeneous columns)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # transcript table tmp
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # Merge on transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add transcript list
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotations fields (distinct values joined with ',')
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Transcripts mapping
            if transcript_id_mapping_file:

                # Transcript dataframe; the local DataFrame is referenced by this
                # name in the SQL queries below (DuckDB replacement scan)
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version remove
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Transcript column for group by merge: prefer the mapped id when available
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve columns after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for select clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # NOTE(review): aggregated original ids are exposed under the
                        # alias 'transcripts_mapped' (plural) — confirm the naming is
                        # intentional and not a typo for 'transcript_mapped'
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                    {query_transcript_merge_group_by}
                """

                # Add transcript filter from mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # No transcript mapping
            else:

                # Remove transcript version
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
10963 ) 10964 query_transcript_column_group_by = query_transcript_column 10965 10966 # Query for transcripts view 10967 query_merge_on_transcripts = f""" 10968 SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)} 10969 FROM ({query_merge}) AS {transcript_table_tmp} 10970 GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} 10971 """ 10972 10973 log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}") 10974 10975 # Drop transcript view is necessary 10976 if transcripts_table_drop: 10977 query_drop = f""" 10978 DROP TABLE IF EXISTS {transcripts_table}; 10979 """ 10980 self.execute_query(query=query_drop) 10981 10982 # Merge and create transcript view 10983 query_create_view = f""" 10984 CREATE TABLE IF NOT EXISTS {transcripts_table} 10985 AS {query_merge_on_transcripts} 10986 """ 10987 self.execute_query(query=query_create_view) 10988 10989 # Remove added columns 10990 for added_column in added_columns: 10991 self.drop_column(column=added_column) 10992 10993 else: 10994 10995 transcripts_table = None 10996 10997 return transcripts_table 10998 10999 def annotation_format_to_table( 11000 self, 11001 uniquify: bool = True, 11002 annotation_field: str = "ANN", 11003 annotation_id: str = "Feature_ID", 11004 view_name: str = "transcripts", 11005 column_rename: dict = {}, 11006 column_clean: bool = False, 11007 column_case: str = None, 11008 ) -> str: 11009 """ 11010 The `annotation_format_to_table` function converts annotation data from a VCF file into a 11011 structured table format, ensuring unique values and creating a temporary table for further 11012 processing or analysis. 11013 11014 :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure 11015 unique values in the output or not. 
If set to `True`, the function will make sure that the 11016 output values are unique, defaults to True 11017 :type uniquify: bool (optional) 11018 :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file 11019 that contains the annotation information for each variant. This field is used to extract the 11020 annotation details for further processing in the function. By default, it is set to "ANN", 11021 defaults to ANN 11022 :type annotation_field: str (optional) 11023 :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method 11024 is used to specify the identifier for the annotation feature. This identifier will be used as a 11025 column name in the resulting table or view that is created based on the annotation data. It 11026 helps in uniquely identifying each annotation entry in the, defaults to Feature_ID 11027 :type annotation_id: str (optional) 11028 :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used 11029 to specify the name of the temporary table that will be created to store the transformed 11030 annotation data. This table will hold the extracted information from the annotation field in a 11031 structured format for further processing or analysis. By default,, defaults to transcripts 11032 :type view_name: str (optional) 11033 :param column_rename: The `column_rename` parameter in the `annotation_format_to_table` method 11034 is a dictionary that allows you to specify custom renaming for columns. By providing key-value 11035 pairs in this dictionary, you can rename specific columns in the resulting table or view that is 11036 created based on the annotation data. This feature enables 11037 :type column_rename: dict 11038 :param column_clean: The `column_clean` parameter in the `annotation_format_to_table` method is 11039 a boolean flag that determines whether the annotation field should undergo a cleaning process. 
11040 If set to `True`, the function will clean the annotation field before further processing. This 11041 cleaning step may involve removing any unwanted characters, formatting inconsistencies, defaults 11042 to False 11043 :type column_clean: bool (optional) 11044 :param column_case: The `column_case` parameter in the `annotation_format_to_table` method is 11045 used to specify the case transformation to be applied to the column names extracted from the 11046 annotation data. It allows you to set the case of the column names to either lowercase or 11047 uppercase for consistency or other specific requirements during the conversion 11048 :type column_case: str 11049 :return: The function `annotation_format_to_table` is returning the name of the view created, 11050 which is stored in the variable `view_name`. 11051 """ 11052 11053 # Annotation field 11054 annotation_format = "annotation_explode" 11055 11056 # Transcript annotation 11057 if column_rename: 11058 annotation_id = column_rename.get(annotation_id, annotation_id) 11059 11060 if column_clean: 11061 annotation_id = clean_annotation_field(annotation_id) 11062 11063 # Prefix 11064 prefix = self.get_explode_infos_prefix() 11065 if prefix: 11066 prefix = "INFO/" 11067 11068 # Annotation fields 11069 annotation_infos = prefix + annotation_field 11070 annotation_format_infos = prefix + annotation_format 11071 11072 # Variants table 11073 table_variants = self.get_table_variants() 11074 11075 # Header 11076 vcf_reader = self.get_header() 11077 11078 # Add columns 11079 added_columns = [] 11080 11081 # Explode HGVS field in column 11082 added_columns += self.explode_infos(fields=[annotation_field]) 11083 11084 if annotation_field in vcf_reader.infos: 11085 11086 # Extract ANN header 11087 ann_description = vcf_reader.infos[annotation_field].desc 11088 pattern = r"'(.+?)'" 11089 match = re.search(pattern, ann_description) 11090 if match: 11091 ann_header_match = match.group(1).split(" | ") 11092 ann_header = [] 
11093 ann_header_desc = {} 11094 for i in range(len(ann_header_match)): 11095 ann_header_info = "".join( 11096 char for char in ann_header_match[i] if char.isalnum() 11097 ) 11098 ann_header.append(ann_header_info) 11099 ann_header_desc[ann_header_info] = ann_header_match[i] 11100 if not ann_header_desc: 11101 raise ValueError("Invalid header description format") 11102 else: 11103 raise ValueError("Invalid header description format") 11104 11105 # Create variant id 11106 variant_id_column = self.get_variant_id_column() 11107 added_columns += [variant_id_column] 11108 11109 # Create dataframe 11110 dataframe_annotation_format = self.get_query_to_df( 11111 f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """ 11112 ) 11113 11114 # Create annotation columns 11115 dataframe_annotation_format[ 11116 annotation_format_infos 11117 ] = dataframe_annotation_format[annotation_infos].apply( 11118 lambda x: explode_annotation_format( 11119 annotation=str(x), 11120 uniquify=uniquify, 11121 output_format="JSON", 11122 prefix="", 11123 header=list(ann_header_desc.values()), 11124 ) 11125 ) 11126 11127 # Find keys 11128 query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;""" 11129 df_keys = self.get_query_to_df(query=query_json) 11130 11131 # Check keys 11132 query_json_key = [] 11133 for _, row in df_keys.iterrows(): 11134 11135 # Key 11136 key = row.iloc[0] 11137 key_clean = key 11138 11139 # key rename 11140 if column_rename: 11141 key_clean = column_rename.get(key_clean, key_clean) 11142 11143 # key clean 11144 if column_clean: 11145 key_clean = clean_annotation_field(key_clean) 11146 11147 # Key case 11148 if column_case: 11149 if column_case.lower() in ["lower"]: 11150 key_clean = key_clean.lower() 11151 elif column_case.lower() in ["upper"]: 11152 key_clean = key_clean.upper() 11153 11154 # Type 11155 query_json_type = f"""SELECT 
                    unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (fill NA with "", map ""/None to NaN,
                # drop NaN rows) so that type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column SQL type from the remaining values
                column_type = detect_column_type(df_json_type[key_clean])

                # Append a typed extraction clause for this annotation key
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table: one row per transcript annotation,
            # with the annotation id exposed as the 'transcript' column
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the VCF header: nothing to explode
            view_name = None

        # Remove columns added by explode_infos/variant id creation
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name

    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Update the variants table with transcript annotations taken from a
        transcripts table, either as JSON (column and/or INFO field) and/or as a
        pipe-separated structured value (column and/or INFO field).

        NOTE(review): `param` uses a mutable dict default — shared across calls;
        confirm no caller mutates it in place.

        :param transcripts_table: name of the table containing transcripts data;
            defaults to param["transcripts"]["table"] or "transcripts"
        :param transcripts_column_id: column of the transcripts table holding the
            transcript identifier; defaults to param["transcripts"]["column_id"]
            or "transcript"
        :param transcripts_info_json: variants table column receiving the
            transcripts information as JSON
        :param transcripts_info_field_json: VCF INFO field receiving the
            transcripts information as JSON
        :param transcripts_info_format: variants table column receiving the
            transcripts information in 'transcript|field|...' format
        :param transcripts_info_field_format: VCF INFO field receiving the
            transcripts information in structured format
        :param param: configuration dictionary; self.get_param() is used when empty
        :return: True when an update was attempted, False when no output
            column/field was configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Default
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): when only transcripts_info_field_json is set,
        # transcripts_info_json stays None and the SELECT alias in the update
        # query below becomes the literal "None". The commented code was likely
        # the intended fix — confirm and re-enable:
        # if transcripts_info_field_json is not None and transcripts_info_json is None:
        #     transcripts_info_json = transcripts_info_field_json

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # (kept for reference — FORMAT has an explicit fallback below instead)
        # if (
        #     transcripts_info_field_format is not None
        #     and transcripts_info_format is None
        # ):
        #     transcripts_info_format = transcripts_info_field_format

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no JSON/FORMAT column or INFO field is requested
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Transcripts infos columns: everything except coordinates and the id column
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # Build SELECT / JSON / FORMAT clauses for each annotation column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # Update
        update_set_json = []
        update_set_format = []

        # VCF header
        vcf_reader = self.get_header()

        # Transcripts to info column in JSON
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Add header
            # NOTE(review): "unknwon" (source/version fields) is a typo kept as-is
            # here to avoid changing emitted header content in a doc-only pass
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Transcripts to info field in JSON
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append the JSON payload to the INFO field ('' and '.' treated as empty)
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate one JSON object per variant, keyed by transcript id
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Transcripts to info column in FORMAT
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Add header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Set variable for internal queries (alias used in the query below)
            transcripts_info_format = "transcripts_info_format"

        # Transcripts to info field in FORMAT
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append the structured payload to the INFO field ('' and '.' treated as empty)
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Add header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Aggregate one 'transcript|field|...' string per variant
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
class Variants:
    """
    Object wrapping a variants file (VCF/TSV/CSV/PSV/Parquet/duckdb) together
    with its database connection (duckdb or sqlite), its VCF header, its
    configuration and parameters, and its samples.
    """

    def __init__(
        self,
        conn=None,
        input: str = None,
        output: str = None,
        config: dict = {},
        param: dict = {},
        load: bool = False,
    ) -> None:
        """
        Initialize the object: input, config, param, output, connexion, header
        and samples; optionally load the data.

        NOTE(review): `config` and `param` use mutable dict defaults, shared
        across calls — confirm callers never mutate them in place.

        :param conn: the connection to the database (created if not provided)
        :param input: the input file
        :param output: the output file
        :param config: a dictionary containing the configuration of the model
        :param param: a dictionary containing the parameters of the model
        :param load: if True, load data right after initialization
        """

        # Init variables
        self.init_variables()

        # Input
        self.set_input(input)

        # Config
        self.set_config(config)

        # Param
        self.set_param(param)

        # Output
        self.set_output(output)

        # connexion
        self.set_connexion(conn)

        # Header
        self.set_header()

        # Samples
        self.set_samples()

        # Load data
        if load:
            self.load_data()

    def set_samples(self, samples: list = None) -> list:
        """
        Set the `samples` attribute to the provided list, or fall back to
        param["samples"]["list"] when no list is given.

        :param samples: list of sample names; when falsy, the value is read
            from the parameters via `get_param()` (may be None)
        :type samples: list
        :return: the `samples` list that was set
        """

        if not samples:
            samples = self.get_param().get("samples", {}).get("list", None)

        self.samples = samples

        return samples

    def get_samples(self) -> list:
        """
        Return the `samples` attribute of the object.
        :return: the list of samples (possibly None)
        """

        return self.samples

    def get_samples_check(self) -> bool:
        """
        Return param["samples"]["check"].

        :return: the value of the "check" key inside the "samples" dictionary
            from `get_param()`; defaults to `True` when the key is absent.
        """

        return self.get_param().get("samples", {}).get("check", True)

    def set_input(self, input: str = None) -> None:
        """
        Set input file attributes: `input`, `input_name`, `input_extension`
        and `input_format` (extension without the leading dot).

        :param input: the input file path, or a file-like object exposing `.name`
        :type input: str
        """

        if input and not isinstance(input, str):
            try:
                self.input = input.name
            except:
                # NOTE(review): bare except hides the real error; the message
                # also has an unbalanced quote ("'{input} in bad format") —
                # string left as-is in this doc-only pass
                log.error(f"Input file '{input} in bad format")
                raise ValueError(f"Input file '{input} in bad format")
        else:
            self.input = input

        # Input format derived from the file extension
        if input:
            input_name, input_extension = os.path.splitext(self.input)
            self.input_name = input_name
            self.input_extension = input_extension
            self.input_format = self.input_extension.replace(".", "")

    def set_config(self, config: dict) -> None:
        """
        Set the configuration dictionary for the object.

        :param config: dictionary of configuration settings
        :type config: dict
        """

        self.config = config

    def set_param(self, param: dict) -> None:
        """
        Set the parameters dictionary for the object.

        :param param: dictionary of parameters
        :type param: dict
        """

        self.param = param

    def init_variables(self) -> None:
        """
        Initialize the attributes used by the rest of the class: table names,
        operator/type maps and the additional index field list.
        """

        self.prefix = "howard"
        self.table_variants = "variants"
        self.dataframe = None

        # Filter operator names -> SQL operators
        self.comparison_map = {
            "gt": ">",
            "gte": ">=",
            "lt": "<",
            "lte": "<=",
            "equals": "=",
            "contains": "SIMILAR TO",
        }

        # VCF header type -> internal type code
        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}

        # VCF header type -> SQL column type
        self.code_type_map_to_sql = {
            "Integer": "INTEGER",
            "String": "VARCHAR",
            "Float": "FLOAT",
            "Flag": "VARCHAR",
        }

        self.index_additionnal_fields = []

    def get_indexing(self) -> bool:
        """
        Return the "indexing" parameter.
        :return: the value of the "indexing" key in the parameters; False when absent.
        """

        return self.get_param().get("indexing", False)

    def get_connexion_config(self) -> dict:
        """
        The function `get_connexion_config` returns a dictionary containing the configuration for a
        connection, including the number of threads, memory limit, temporary
        directory and access mode.
        :return: a dictionary containing the connection configuration.
        """

        # config
        config = self.get_config()

        # Connexion config
        connexion_config = {}
        threads = self.get_threads()

        # Threads
        if threads:
            connexion_config["threads"] = threads

        # Memory
        if self.get_memory():
            connexion_config["memory_limit"] = self.get_memory()

        # Temporary directory
        if config.get("tmp", None):
            connexion_config["temp_directory"] = config.get("tmp")

        # Access mode: map short codes to duckdb access modes
        if config.get("access", None):
            access = config.get("access")
            if access in ["RO"]:
                access = "READ_ONLY"
            elif access in ["RW"]:
                access = "READ_WRITE"
            connexion_db = self.get_connexion_db()
            # NOTE(review): substring test — `connexion_db in ":memory:"` is True
            # for ANY substring of ":memory:" (e.g. "memory"); `==` was probably
            # intended. Left as-is in this doc-only pass — confirm.
            if connexion_db in ":memory:":
                access = "READ_WRITE"
            connexion_config["access_mode"] = access

        return connexion_config

    def get_duckdb_settings(self) -> dict:
        """
        Retrieve DuckDB settings from the configuration, either from a file
        (YAML/JSON) or from a JSON string.

        :return: a dictionary of DuckDB settings (empty when none configured)
        """

        # config
        config = self.get_config()

        # duckdb settings
        duckdb_settings_dict = {}
        if config.get("duckdb_settings", None):
            duckdb_settings = config.get("duckdb_settings")
            duckdb_settings = full_path(duckdb_settings)
            # duckdb setting is a file (YAML parser also accepts JSON)
            if os.path.exists(duckdb_settings):
                with open(duckdb_settings) as json_file:
                    duckdb_settings_dict = yaml.safe_load(json_file)
            # duckdb settings is a string
            else:
                duckdb_settings_dict = json.loads(duckdb_settings)

        return duckdb_settings_dict

    def set_connexion_db(self) -> str:
        """
        Determine and set the database connection string based on the input
        format and the configured connection type.

        :return: the connection string stored in `self.connexion_db`
        """

        # Default connexion db
        default_connexion_db = ":memory:"

        # Find connexion db
        if self.get_input_format() in ["db", "duckdb"]:
            # duckdb input: connect directly to the input file
            connexion_db = self.get_input()
        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
            connexion_db = default_connexion_db
        elif self.get_connexion_type() in ["tmpfile"]:
            # temporary on-disk database
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif self.get_connexion_type() != "":
            # explicit connection string
            connexion_db = self.get_connexion_type()
        else:
            connexion_db = default_connexion_db

        # Set connexion db
        self.connexion_db = connexion_db

        return connexion_db

    def set_connexion(self, conn) -> None:
        """
        Create (or adopt) the database connection, with options for different
        database formats and settings.

        :param conn: the connection to the database. If not provided, a new
            connection is created from the configured connection string.
        The connection is set up according to the configured format
        ("duckdb", default, or "sqlite").
        """

        # Connexion db
        connexion_db = self.set_connexion_db()

        # Connexion config
        connexion_config = self.get_connexion_config()

        # Connexion format
        connexion_format = self.get_config().get("connexion_format", "duckdb")
        # Set connexion format
        self.connexion_format = connexion_format

        # Connexion
        if not conn:
            if connexion_format in ["duckdb"]:
                conn = duckdb.connect(connexion_db, config=connexion_config)
                # duckDB settings applied via PRAGMA; string values are quoted
                duckdb_settings = self.get_duckdb_settings()
                if duckdb_settings:
                    for setting in duckdb_settings:
                        setting_value = duckdb_settings.get(setting)
                        if isinstance(setting_value, str):
                            setting_value = f"'{setting_value}'"
                        conn.execute(f"PRAGMA {setting}={setting_value};")
            elif connexion_format in ["sqlite"]:
                conn = sqlite3.connect(connexion_db)

        # Set connexion
        self.conn = conn

        # Log
        log.debug(f"connexion_format: {connexion_format}")
        log.debug(f"connexion_db: {connexion_db}")
        log.debug(f"connexion config: {connexion_config}")
        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

    def set_output(self, output: str = None) -> None:
        """
        Set output file attributes: `output`, `output_name`, `output_extension`
        and `output_format` (extension without the leading dot).

        :param output: the output file path, or a file-like object exposing
            `.name`. If no output is provided, all output attributes are None.
        :type output: str
        """

        if output and not isinstance(output, str):
            self.output = output.name
        else:
            self.output = output

        # Output format derived from the file extension
        if self.output:
            output_name, output_extension = os.path.splitext(self.output)
            self.output_name = output_name
            self.output_extension = output_extension
            self.output_format = self.output_extension.replace(".", "")
        else:
            self.output_name = None
            self.output_extension = None
            self.output_format = None

    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`header_list`) and as a VCF Reader object (`header_vcf`).
        Falls back to a minimal default VCF header when none can be found.
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM POS ID REF ALT QUAL FILTER INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itsself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            # NOTE(review): file opened/closed manually around
                            # vcf.Writer — a `with` block would be safer on error
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    except:

                        # NOTE(review): bare except — best-effort fallback to the
                        # default header, but it swallows all error types
                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            # No input file: no header
            self.header_list = None
            self.header_vcf = None

    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame,
        dispatching on the connection format (duckdb or sqlite).

        :param query: the SQL query to execute
        :type query: str
        :param limit: maximum number of rows to fetch; when given, only the
            first batch/chunk of that size is returned
        :type limit: int
        :return: a pandas DataFrame with the query result
        """

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Limit in query: fetch only the first batch/chunk of `limit` rows
        if limit:
            pd.set_option("display.max_rows", limit)
            if connexion_format in ["duckdb"]:
                df = (
                    self.conn.execute(query)
                    .fetch_record_batch(limit)
                    .read_next_batch()
                    .to_pandas()
                )
            elif connexion_format in ["sqlite"]:
                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))

        # Full query
        else:
            if connexion_format in ["duckdb"]:
                df = self.conn.execute(query).df()
            elif connexion_format in ["sqlite"]:
                df = pd.read_sql_query(query, self.conn)

        return df

    def get_overview(self) -> None:
        """
        Log an overview of the current object: input, output, config, param,
        sample list and a preview DataFrame of the variants.
        """
        table_variants_from = self.get_table_variants(clause="from")
        sql_columns = self.get_header_columns_as_sql()
        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
        df = self.get_query_to_df(sql_query_export)
        log.info(
            "Input: "
            + str(self.get_input())
            + " ["
            + str(str(self.get_input_format()))
            + "]"
        )
        log.info(
            "Output: "
            + str(self.get_output())
            + " ["
            + str(str(self.get_output_format()))
            + "]"
        )
        log.info("Config: ")
        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Param: ")
        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
            "\n"
        ):
            log.info("\t" + str(d))
        log.info("Sample list: " + str(self.get_header_sample_list()))
        log.info("Dataframe: ")
        for d in str(df).split("\n"):
            log.info("\t" + str(d))

        # garbage collector: release the preview DataFrame eagerly
        del df
        gc.collect()

        return None

    def get_stats(self) -> dict:
        """
        Calculate and return various statistics of the current object:
        input file, variants, samples, header fields, quality and SNVs/InDels.

        :return: a dictionary of statistics keyed by section
            ("Infos", "Variants", "Samples", "Header", ...)
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )
stats["Infos"]["Number of variants"] = int(nb_of_variants) 622 623 ### Samples 624 625 # Init 626 samples = {} 627 nb_of_samples = 0 628 629 # Check Samples 630 if "GT" in header_formats_list and "FORMAT" in self.get_header_columns(): 631 log.debug(f"Check samples...") 632 for sample in self.get_header_sample_list(): 633 sql_query_samples = f""" 634 SELECT '{sample}' as sample, 635 REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype, 636 count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count, 637 concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage 638 FROM {table_variants_from} 639 WHERE ( 640 regexp_matches("{sample}", '^[0-9]([/|][0-9])+') 641 AND 642 len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':')) 643 ) 644 GROUP BY genotype 645 """ 646 sql_query_genotype_df = self.conn.execute(sql_query_samples).df() 647 sample_genotype_count = sql_query_genotype_df["count"].sum() 648 if len(sql_query_genotype_df): 649 nb_of_samples += 1 650 samples[f"{sample} - {sample_genotype_count} variants"] = ( 651 sql_query_genotype_df.to_dict(orient="index") 652 ) 653 654 stats["Samples"] = samples 655 stats["Infos"]["Number of samples"] = nb_of_samples 656 657 # # 658 # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list: 659 # stats["Infos"]["Number of samples"] = nb_of_samples 660 # elif nb_of_samples: 661 # stats["Infos"]["Number of samples"] = "not a VCF format" 662 663 ### INFO and FORMAT fields 664 header_types_df = {} 665 header_types_list = { 666 "List of INFO fields": header_infos, 667 "List of FORMAT fields": header_formats, 668 } 669 i = 0 670 for header_type in header_types_list: 671 672 header_type_infos = header_types_list.get(header_type) 673 header_infos_dict = {} 674 675 for info in header_type_infos: 676 677 i += 1 678 header_infos_dict[i] = {} 679 680 # ID 681 header_infos_dict[i]["id"] = info 682 683 # num 684 genotype_map = 
{None: ".", -1: "A", -2: "G", -3: "R"} 685 if header_type_infos[info].num in genotype_map.keys(): 686 header_infos_dict[i]["Number"] = genotype_map.get( 687 header_type_infos[info].num 688 ) 689 else: 690 header_infos_dict[i]["Number"] = header_type_infos[info].num 691 692 # type 693 if header_type_infos[info].type: 694 header_infos_dict[i]["Type"] = header_type_infos[info].type 695 else: 696 header_infos_dict[i]["Type"] = "." 697 698 # desc 699 if header_type_infos[info].desc != None: 700 header_infos_dict[i]["Description"] = header_type_infos[info].desc 701 else: 702 header_infos_dict[i]["Description"] = "" 703 704 if len(header_infos_dict): 705 header_types_df[header_type] = pd.DataFrame.from_dict( 706 header_infos_dict, orient="index" 707 ).to_dict(orient="index") 708 709 # Stats 710 stats["Infos"]["Number of INFO fields"] = len(header_infos_list) 711 stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list) 712 stats["Header"] = header_types_df 713 714 ### QUAL 715 if "QUAL" in self.get_header_columns(): 716 sql_query_qual = f""" 717 SELECT 718 avg(CAST(QUAL AS INTEGER)) AS Average, 719 min(CAST(QUAL AS INTEGER)) AS Minimum, 720 max(CAST(QUAL AS INTEGER)) AS Maximum, 721 stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation, 722 median(CAST(QUAL AS INTEGER)) AS Median, 723 variance(CAST(QUAL AS INTEGER)) AS Variance 724 FROM {table_variants_from} 725 WHERE CAST(QUAL AS VARCHAR) NOT IN ('.') 726 """ 727 728 qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index") 729 stats["Quality"] = {"Stats": qual} 730 731 ### SNV and InDel 732 733 sql_query_snv = f""" 734 735 SELECT Type, count FROM ( 736 737 SELECT 738 'Total' AS Type, 739 count(*) AS count 740 FROM {table_variants_from} 741 742 UNION 743 744 SELECT 745 'MNV' AS Type, 746 count(*) AS count 747 FROM {table_variants_from} 748 WHERE len(REF) > 1 AND len(ALT) > 1 749 AND len(REF) = len(ALT) 750 751 UNION 752 753 SELECT 754 'InDel' AS Type, 755 count(*) AS count 756 FROM 
{table_variants_from} 757 WHERE len(REF) > 1 OR len(ALT) > 1 758 AND len(REF) != len(ALT) 759 760 UNION 761 762 SELECT 763 'SNV' AS Type, 764 count(*) AS count 765 FROM {table_variants_from} 766 WHERE len(REF) = 1 AND len(ALT) = 1 767 768 ) 769 770 ORDER BY count DESC 771 772 """ 773 snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index") 774 775 sql_query_snv_substitution = f""" 776 SELECT 777 concat(REF, '>', ALT) AS 'Substitution', 778 count(*) AS count 779 FROM {table_variants_from} 780 WHERE len(REF) = 1 AND len(ALT) = 1 781 GROUP BY REF, ALT 782 ORDER BY count(*) DESC 783 """ 784 snv_substitution = ( 785 self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index") 786 ) 787 stats["Variants"]["Counts"] = snv_indel 788 stats["Variants"]["Substitutions"] = snv_substitution 789 790 return stats 791 792 def stats_to_file(self, file: str = None) -> str: 793 """ 794 The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them 795 into a JSON object, and writes the JSON object to the specified file. 796 797 :param file: The `file` parameter is a string that represents the file path where the JSON data 798 will be written 799 :type file: str 800 :return: the name of the file that was written to. 801 """ 802 803 # Get stats 804 stats = self.get_stats() 805 806 # Serializing json 807 json_object = json.dumps(stats, indent=4) 808 809 # Writing to sample.json 810 with open(file, "w") as outfile: 811 outfile.write(json_object) 812 813 return file 814 815 def print_stats(self, output_file: str = None, json_file: str = None) -> None: 816 """ 817 The `print_stats` function generates a markdown file and prints the statistics contained in a 818 JSON file in a formatted manner. 819 820 :param output_file: The `output_file` parameter is a string that specifies the path and filename 821 of the output file where the stats will be printed in Markdown format. 
If no `output_file` is 822 provided, a temporary directory will be created and the stats will be saved in a file named 823 "stats.md" within that 824 :type output_file: str 825 :param json_file: The `json_file` parameter is a string that represents the path to the JSON 826 file where the statistics will be saved. If no value is provided, a temporary directory will be 827 created and a default file name "stats.json" will be used 828 :type json_file: str 829 :return: The function `print_stats` does not return any value. It has a return type annotation 830 of `None`. 831 """ 832 833 # Full path 834 output_file = full_path(output_file) 835 json_file = full_path(json_file) 836 837 with tempfile.TemporaryDirectory() as tmpdir: 838 839 # Files 840 if not output_file: 841 output_file = os.path.join(tmpdir, "stats.md") 842 if not json_file: 843 json_file = os.path.join(tmpdir, "stats.json") 844 845 # Create folders 846 if not os.path.exists(os.path.dirname(output_file)): 847 Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True) 848 if not os.path.exists(os.path.dirname(json_file)): 849 Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True) 850 851 # Create stats JSON file 852 stats_file = self.stats_to_file(file=json_file) 853 854 # Print stats file 855 with open(stats_file) as f: 856 stats = yaml.safe_load(f) 857 858 # Output 859 output_title = [] 860 output_index = [] 861 output = [] 862 863 # Title 864 output_title.append("# HOWARD Stats") 865 866 # Index 867 output_index.append("## Index") 868 869 # Process sections 870 for section in stats: 871 infos = stats.get(section) 872 section_link = "#" + section.lower().replace(" ", "-") 873 output.append(f"## {section}") 874 output_index.append(f"- [{section}]({section_link})") 875 876 if len(infos): 877 for info in infos: 878 try: 879 df = pd.DataFrame.from_dict(infos.get(info), orient="index") 880 is_df = True 881 except: 882 try: 883 df = pd.DataFrame.from_dict( 884 
json.loads((infos.get(info))), orient="index" 885 ) 886 is_df = True 887 except: 888 is_df = False 889 if is_df: 890 output.append(f"### {info}") 891 info_link = "#" + info.lower().replace(" ", "-") 892 output_index.append(f" - [{info}]({info_link})") 893 output.append(f"{df.to_markdown(index=False)}") 894 else: 895 output.append(f"- {info}: {infos.get(info)}") 896 else: 897 output.append(f"NA") 898 899 # Write stats in markdown file 900 with open(output_file, "w") as fp: 901 for item in output_title: 902 fp.write("%s\n" % item) 903 for item in output_index: 904 fp.write("%s\n" % item) 905 for item in output: 906 fp.write("%s\n" % item) 907 908 # Output stats in markdown 909 print("") 910 print("\n\n".join(output_title)) 911 print("") 912 print("\n\n".join(output)) 913 print("") 914 915 return None 916 917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input 923 924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format 940 941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. 
        If no `input_file` is provided when calling the
        method, it will default to `None` and the method will then call `self.get_input()` to
        :type input_file: str
        :return: The function `get_input_compressed` returns the compressed format of the input
        variable.
        """

        if not input_file:
            input_file = self.get_input()
        input_compressed = get_file_compressed(input_file)
        return input_compressed

    def get_output(self) -> str:
        """
        It returns the output file of the object.
        :return: The output file path.
        """

        return self.output

    def get_output_format(self, output_file: str = None) -> str:
        """
        The function `get_output_format` returns the format of the input variable or the output file if
        provided.

        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
        that represents the file path of the output file. If no `output_file` is provided when calling
        the method, it will default to the output obtained from the `get_output` method of the class
        instance. The
        :type output_file: str
        :return: The format of the input variable is being returned.
        """

        if not output_file:
            output_file = self.get_output()
        output_format = get_file_format(output_file)

        return output_format

    def get_config(self) -> dict:
        """
        It returns the config
        :return: The config variable is being returned.
        """
        return self.config

    def get_param(self) -> dict:
        """
        It returns the param
        :return: The param variable is being returned.
        """
        return self.param

    def get_connexion_db(self) -> str:
        """
        It returns the connexion_db attribute of the object
        :return: The connexion_db is being returned.
        """
        return self.connexion_db

    def get_prefix(self) -> str:
        """
        It returns the prefix of the object.
        :return: The prefix is being returned.
        """
        return self.prefix

    def get_table_variants(self, clause: str = "select") -> str:
        """
        This function returns the table_variants attribute of the object

        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
        defaults to select (optional)
        :return: The table_variants attribute of the object.
        """

        # Access
        access = self.get_config().get("access", None)

        # Clauses "select", "where", "update"
        if clause in ["select", "where", "update"]:
            table_variants = self.table_variants
        # Clause "from"
        elif clause in ["from"]:
            # For Read Only: query the parquet input file directly
            if self.get_input_format() in ["parquet"] and access in ["RO"]:
                input_file = self.get_input()
                table_variants = f"'{input_file}' as variants"
            # For Read Write
            else:
                table_variants = f"{self.table_variants} as variants"
        else:
            table_variants = self.table_variants
        return table_variants

    def get_tmp_dir(self) -> str:
        """
        The function `get_tmp_dir` returns the temporary directory path based on configuration
        parameters or a default path.
        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
        configuration, parameters, and a default value of "/tmp".
        """

        return get_tmp(
            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
        )

    def get_connexion_type(self) -> str:
        """
        It returns the connexion type from the config, defaulting to "memory".

        :return: The connexion type is being returned.
        """
        return self.get_config().get("connexion_type", "memory")

    def get_connexion(self):
        """
        It returns the connection object

        :return: The connection object.
        """
        return self.conn

    def close_connexion(self) -> None:
        """
        This function closes the connection to the database.
        :return: The connection is being closed.
        """
        return self.conn.close()

    def get_header(self, type: str = "vcf"):
        """
        This function returns the header of the VCF file as a vcf.Reader object or a list of strings

        :param type: the type of header you want to get, "vcf" or "list", defaults to vcf (optional)
        :return: The header of the vcf file.
        """

        if self.header_vcf:
            if type == "vcf":
                return self.header_vcf
            elif type == "list":
                return self.header_list
        else:
            # No header loaded: fall back to the minimal required VCF header
            if type == "vcf":
                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
                return header
            elif type == "list":
                return vcf_required

    def get_header_infos_list(self) -> list:
        """
        This function retrieves a list of information fields from the header.
        :return: A list of information fields from the header.
        """

        # Init
        infos_list = []

        for field in self.get_header().infos:
            infos_list.append(field)

        return infos_list

    def get_header_length(self, file: str = None) -> int:
        """
        The function `get_header_length` returns the length of the header list, excluding the #CHROM
        line.

        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
        header file. If this argument is provided, the function will read the header from the specified
        file and return the length of the header list minus 1 (to exclude the #CHROM line)
        :type file: str
        :return: the length of the header list, excluding the #CHROM line.
        """

        if file:
            return len(self.read_vcf_header_file(file=file)) - 1
        elif self.get_header(type="list"):
            return len(self.get_header(type="list")) - 1
        else:
            return 0

    def get_header_columns(self) -> str:
        """
        This function returns the header columns line (the "#CHROM" line) of a VCF

        :return: The last line of the header list, i.e. the "#CHROM" columns line.
        """
        if self.get_header():
            return self.get_header(type="list")[-1]
        else:
            return ""

    def get_header_columns_as_list(self) -> list:
        """
        This function returns the header columns of a VCF as a list

        :return: The header columns line split on tabs, one item per column.
        """
        if self.get_header():
            return self.get_header_columns().strip().split("\t")
        else:
            return []

    def get_header_columns_as_sql(self) -> str:
        """
        This function returns the header columns as comma-separated quoted SQL identifiers

        :return: The header columns, each double-quoted, joined with commas.
        """
        sql_column_list = []
        for col in self.get_header_columns_as_list():
            sql_column_list.append(f'"{col}"')
        return ",".join(sql_column_list)

    def get_header_sample_list(
        self, check: bool = False, samples: list = None, samples_force: bool = False
    ) -> list:
        """
        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
        checking and filtering based on input parameters.

        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
        parameter that determines whether to check if the samples in the list are properly defined as
        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
        list is defined as a, defaults to False
        :type check: bool (optional)
        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
        allows you to specify a subset of samples from the header. If you provide a list of sample
        names, the function will check if each sample is defined in the header.
        If a sample is not found
        in the
        :type samples: list
        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
        a boolean parameter that determines whether to force the function to return the sample list
        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
        function will return the sample list without performing, defaults to False
        :type samples_force: bool (optional)
        :return: The function `get_header_sample_list` returns a list of samples based on the input
        parameters and conditions specified in the function.
        """

        # Init
        samples_list = []

        if samples is None:
            samples_list = self.header_vcf.samples
        else:
            # Keep only the requested samples that exist in the header
            samples_checked = []
            for sample in samples:
                if sample in self.header_vcf.samples:
                    samples_checked.append(sample)
                else:
                    log.warning(f"Sample '{sample}' not defined in header")
            samples_list = samples_checked

        # Force sample list without checking if is_genotype_column
        if samples_force:
            log.warning(f"Samples {samples_list} not checked if genotypes")
            return samples_list

        if check:
            # Keep only columns whose content is a well-defined genotype
            samples_checked = []
            for sample in samples_list:
                if self.is_genotype_column(column=sample):
                    samples_checked.append(sample)
                else:
                    log.warning(
                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
                    )
            samples_list = samples_checked

        # Return samples list
        return samples_list

    def is_genotype_column(self, column: str = None) -> bool:
        """
        This function checks if a given column is a genotype column in a database.

        :param column: The `column` parameter in the `is_genotype_column` method is a string that
        represents the column name in a database table. This method checks if the specified column is a
        genotype column in the database.
        If a column name is provided, it calls the `is_genotype_column`
        method of
        :type column: str
        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
        column name and returns the result. If the `column` parameter is None, it returns False.
        """

        if column is not None:
            # Delegate the check to the Database object built on the input file
            return Database(database=self.get_input()).is_genotype_column(column=column)
        else:
            return False

    def get_verbose(self) -> bool:
        """
        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
        exist

        :return: The value of the key "verbose" in the config dictionary.
        """
        return self.get_config().get("verbose", False)

    def get_connexion_format(self) -> str:
        """
        It returns the connexion format of the object.
        :return: The connexion_format is being returned.
        """
        connexion_format = self.connexion_format
        # Only DuckDB and SQLite backends are supported
        if connexion_format not in ["duckdb", "sqlite"]:
            log.error(f"Unknown connexion format {connexion_format}")
            raise ValueError(f"Unknown connexion format {connexion_format}")
        else:
            return connexion_format

    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted.
        The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config: chunk size may be overridden by config["load"]["chunk"]
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the local DataFrame "chunk" by name in the
                    # SQL below (Python replacement scan)
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

    def load_data(
        self,
        input_file: str = None,
        drop_variants_table: bool = False,
        sample_size: int = 20480,
    ) -> None:
        """
        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
        table before loading the data and specify a sample size.

        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
        table
        :type input_file: str
        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
        determines whether the variants table should be dropped before loading the data. If set to
        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
        not be dropped, defaults to False
        :type drop_variants_table: bool (optional)
        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
        the input file.
        If it is set to `None`, the default value of 20480 will be used, defaults to
        20480
        :type sample_size: int (optional)
        """

        log.info("Loading...")

        # change input file
        if input_file:
            self.set_input(input_file)
            self.set_header()

        # drop variants table
        if drop_variants_table:
            self.drop_variants_table()

        # get table variants
        table_variants = self.get_table_variants()

        # Access
        access = self.get_config().get("access", None)
        log.debug(f"access: {access}")

        # Input format and compress
        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        log.debug(f"input_format: {input_format}")
        log.debug(f"input_compressed: {input_compressed}")

        # input_compressed_format
        if input_compressed:
            input_compressed_format = "gzip"
        else:
            input_compressed_format = "none"
        log.debug(f"input_compressed_format: {input_compressed_format}")

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Sample size (-1 means no sampling limit)
        if not sample_size:
            sample_size = -1
        log.debug(f"sample_size: {sample_size}")

        # Load data
        log.debug(f"Load Data from {input_format}")

        # DuckDB connexion
        if connexion_format in ["duckdb"]:

            # Database already exists
            if self.input_format in ["db", "duckdb"]:

                if connexion_format in ["duckdb"]:
                    log.debug(f"Input file format '{self.input_format}' duckDB")
                else:
                    log.error(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )
                    raise ValueError(
                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
                    )

            # Load from existing database format
            else:

                try:
                    # Create Table or View from the input file's SQL source
                    database = Database(database=self.input)
                    sql_from = database.get_sql_from(sample_size=sample_size)

                    # Read Only access gets a VIEW, otherwise a materialized TABLE
                    if access in ["RO"]:
                        sql_load = (
                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    else:
                        sql_load = (
                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
                        )
                    self.conn.execute(sql_load)

                except:
                    # Format not available
                    log.error(f"Input file format '{self.input_format}' not available")
                    raise ValueError(
                        f"Input file format '{self.input_format}' not available"
                    )

        # SQLite connexion
        elif connexion_format in ["sqlite"] and input_format in [
            "vcf",
            "tsv",
            "csv",
            "psv",
        ]:

            # Main structure
            structure = {
                "#CHROM": "VARCHAR",
                "POS": "INTEGER",
                "ID": "VARCHAR",
                "REF": "VARCHAR",
                "ALT": "VARCHAR",
                "QUAL": "VARCHAR",
                "FILTER": "VARCHAR",
                "INFO": "VARCHAR",
            }

            # Structure with samples
            # NOTE(review): this is an alias, not a copy — "structure" and
            # "structure_complete" refer to the same dict
            structure_complete = structure
            if self.get_header_sample_list():
                structure["FORMAT"] = "VARCHAR"
                for sample in self.get_header_sample_list():
                    structure_complete[sample] = "VARCHAR"

            # Columns list for create and insert
            sql_create_table_columns = []
            sql_create_table_columns_list = []
            for column in structure_complete:
                column_type = structure_complete[column]
                sql_create_table_columns.append(
                    f'"{column}" {column_type} default NULL'
                )
                sql_create_table_columns_list.append(f'"{column}"')

            # Create database
            log.debug(f"Create Table {table_variants}")
            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
            self.conn.execute(sql_create_table)

            # chunksize define length of file chunk load file
            chunksize = 100000

            # delimiter
            delimiter = file_format_delimiters.get(input_format, "\t")

            # Load the input file
            with open(self.input, "rt") as input_file:

                # Use the appropriate file handler based on the input format
                if input_compressed:
                    input_file = bgzf.open(self.input, "rt")
                # Only VCF inputs have a header to skip
                if input_format in ["vcf"]:
                    header_len = self.get_header_length()
                else:
                    header_len = 0

                # Insert the file contents into a table
                self.insert_file_to_table(
                    input_file,
                    columns=sql_create_table_columns_list_sql,
                    header_len=header_len,
                    sep=delimiter,
                    chunksize=chunksize,
                )

        else:
            log.error(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )
            raise ValueError(
                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
            )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )

        # Create index after insertion
        self.create_indexes()

    def get_explode_infos(self) -> bool:
        """
        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
        to False if it is not set.
        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
        value. If the parameter is not present, it will return False.
        """

        return self.get_param().get("explode", {}).get("explode_infos", False)

    def get_explode_infos_fields(
        self,
        explode_infos_fields: str = None,
        remove_fields_not_in_header: bool = False,
    ) -> list:
        """
        The `get_explode_infos_fields` function returns a list of exploded information fields based on
        the input parameter `explode_infos_fields`.

        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
        comma-separated list of field names to explode
        :type explode_infos_fields: str
        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
        flag that determines whether to remove fields that are not present in the header. If it is set
        to `True`, any field that is not in the header will be excluded from the list of exploded
        information fields. If it is set to `, defaults to False
        :type remove_fields_not_in_header: bool (optional)
        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
        Otherwise, it returns a list of exploded information fields after removing any spaces and
        splitting the string by commas.
        """

        # If no fields, get it in param
        if not explode_infos_fields:
            explode_infos_fields = (
                self.get_param().get("explode", {}).get("explode_infos_fields", None)
            )

        # If no fields, defined as all fields in header using keyword
        if not explode_infos_fields:
            explode_infos_fields = "*"

        # If fields list not empty
        if explode_infos_fields:

            # Input fields list
            if isinstance(explode_infos_fields, str):
                fields_input = explode_infos_fields.split(",")
            elif isinstance(explode_infos_fields, list):
                fields_input = explode_infos_fields
            else:
                fields_input = []

            # Fields list without * keyword
            fields_without_all = fields_input.copy()
            if "*".casefold() in (item.casefold() for item in fields_without_all):
                fields_without_all.remove("*")

            # Fields in header
            fields_in_header = sorted(list(set(self.get_header().infos)))

            # Construct list of fields
            fields_output = []
            for field in fields_input:

                # Strip field
                field = field.strip()

                # format keyword * in regex (match every header field)
                if field.upper() in ["*"]:
                    field = ".*"

                # Find all fields with pattern (each field is treated as a regex)
                r = re.compile(field)
                fields_search = sorted(list(filter(r.match, fields_in_header)))

                # Remove fields input from search: an exact match wins over the
                # pattern expansion; pattern matches already listed explicitly
                # in the input are dropped to avoid duplicates
                if field in fields_search:
                    fields_search = [field]
                elif fields_search != [field]:
                    fields_search = sorted(
                        list(set(fields_search).difference(fields_input))
                    )

                # If field is not in header (avoid not well formatted header)
                if not fields_search and not remove_fields_not_in_header:
                    fields_search = [field]

                # Add found fields
                for new_field in fields_search:
                    # Add field, if not already exists, and if it is in header (if asked)
                    if (
                        new_field not in fields_output
                        and (
                            not remove_fields_not_in_header
                            or new_field in fields_in_header
                        )
                        and new_field not in [".*"]
                    ):
                        fields_output.append(new_field)

            return fields_output

        else:

            return []

    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
        """
        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
        not provided.

        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
        prefix to be used for exploding or expanding information
        :type explode_infos_prefix: str
        :return: the value of the variable `explode_infos_prefix`.
        """

        if not explode_infos_prefix:
            explode_infos_prefix = (
                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
            )

        return explode_infos_prefix

    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column. If a default value is provided, it will be assigned to
        If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
        if it already exists in the table. If `drop` is set to `True`, the function will drop the
        existing column before adding the new column. If `drop` is set to `False` (default), the
        column is left untouched and nothing is added, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (keys `table_name`, `column_name`,
        `column_type`, `default_value`) when a new column was added, or None when the column
        already existed (whether or not it was dropped and re-created).
        """

        # Track whether we actually added a new column vs re-created a dropped one
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive name match)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column already present and we must not drop it: nothing to do
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A re-created (previously dropped) column is not reported as "added"
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

    def drop_column(
        self, column: dict = None, table_name: str = None, column_name: str = None
    ) -> bool:
        """
        The `drop_column` function drops a specified column from a given table in a database and returns
        True if the column was successfully dropped, and False if the column does not exist in the
        table.

        :param column: The `column` parameter is either a dictionary with `table_name` and
        `column_name` keys, or a plain column name string (in which case the variants table is used).
        It takes precedence over the other two parameters
        :type column: dict
        :param table_name: The `table_name` parameter is the name of the table from which you want to
        drop a column
        :type table_name: str
        :param column_name: The `column_name` parameter is the name of the column that you want to drop
        from the table
        :type column_name: str
        :return: a boolean value. It returns True if the column was successfully dropped from the table,
        and False if the column does not exist in the table.
        """

        # Resolve table/column from the `column` argument, which overrides the
        # explicit table_name/column_name parameters
        if column:
            if isinstance(column, dict):
                table_name = column.get("table_name", None)
                column_name = column.get("column_name", None)
            elif isinstance(column, str):
                table_name = self.get_table_variants()
                column_name = column
            else:
                table_name = None
                column_name = None

        # NOTE(review): this guard uses `and`, so a call with only ONE of
        # table_name/column_name missing proceeds and will fail at SQL time with a
        # None in the statement — presumably `or` was intended; confirm with callers.
        if not table_name and not column_name:
            return False

        # Removed
        removed = False

        # Check if the column already exists in the table (case-sensitive here,
        # unlike add_column's case-insensitive check)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name in columns:
            log.debug(f"The {column_name} column exists in the {table_name} table")
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
            return False

        # Add column in table # ALTER TABLE integers DROP k
        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
        self.execute_query(add_column_query)
        removed = True
        log.debug(
            f"The {column_name} column was successfully dropped to the {table_name} table"
        )

        return removed

    def explode_infos(
        self,
        prefix: str = None,
        create_index: bool = False,
        fields: list = None,
        force: bool = False,
        proccess_all_fields_together: bool = False,
        table: str = None,
    ) -> list:
        """
        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
        individual columns, returning a list of added columns.

        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
        `self.get_explode_infos_prefix()` as the prefix (or "INFO/" as last resort)
        :type prefix: str
        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
        `False`, indexes will not be created. The default value is `False`, defaults to False
        :type create_index: bool (optional)
        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
        that you want to explode into individual columns. If this parameter is not provided, all INFO
        fields will be exploded. Field names may be regex patterns, expanded through
        `get_explode_infos_fields`
        :type fields: list
        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
        determines whether to drop and recreate a column if it already exists in the table. If `force`
        is set to `True`, the column will be dropped and recreated. If `force` is set to `False`,
        defaults to False
        :type force: bool (optional)
        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
        flag that determines whether to process all the INFO fields together or individually. If set to
        `True`, all the INFO fields will be processed together.
        If set to `False`, each INFO field will
        be processed individually, defaults to False
        :type proccess_all_fields_together: bool (optional)
        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
        of the table where the exploded INFO fields will be added as individual columns. If you provide
        a value for the `table` parameter, the function will use that table name; otherwise the
        variants table is used
        :type table: str
        :return: The `explode_infos` function returns a list of added columns.
        """

        # Drop indexes before altering the table
        self.drop_indexes()

        # connexion format ("duckdb" or "sqlite") drives the SQL dialect used below
        connexion_format = self.get_connexion_format()

        # Access mode: nothing is done in read-only ("RO") mode
        access = self.get_config().get("access", None)

        # Added columns
        added_columns = []

        if access not in ["RO"]:

            # prefix: argument, else configured prefix, else "INFO/"
            if prefix in [None, True] or not isinstance(prefix, str):
                if self.get_explode_infos_prefix() not in [None, True]:
                    prefix = self.get_explode_infos_prefix()
                else:
                    prefix = "INFO/"

            # table variants
            if table is not None:
                table_variants = table
            else:
                table_variants = self.get_table_variants(clause="select")

            # extra infos (columns present in the table but not in the header)
            # NOTE(review): bare `except:` silently swallows every failure here
            # (including KeyboardInterrupt) — narrow it if possible.
            try:
                extra_infos = self.get_extra_infos()
            except:
                extra_infos = []

            # INFO fields declared in the VCF header
            header_infos = self.get_header().infos

            log.debug(
                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
            )

            sql_info_alter_table_array = []

            # Candidate field names: header fields plus explicitly requested ones
            fields_list = list(header_infos)
            if fields:
                fields_list += fields
            fields_list = set(fields_list)

            # If no fields
            if not fields:
                fields = []

            # Translate fields if patterns (regex expansion against the header)
            fields = self.get_explode_infos_fields(explode_infos_fields=fields)

            for info in fields:

                # Name of the exploded column, e.g. "INFO/DP"
                info_id_sql = prefix + info

                if (
                    info in fields_list
                    or prefix + info in fields_list
                    or info in extra_infos
                ):

                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")

                    # SQL type from the header declaration; unknown fields fall
                    # back to String with num=0
                    if info in header_infos:
                        info_type = header_infos[info].type
                        info_num = header_infos[info].num
                    else:
                        info_type = "String"
                        info_num = 0

                    # Multi-valued fields (Number != 1) are stored as VARCHAR
                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
                    if info_num != 1:
                        type_sql = "VARCHAR"

                    # Add field
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=info_id_sql,
                        column_type=type_sql,
                        default_value="null",
                        drop=force,
                    )

                    if added_column:
                        added_columns.append(added_column)

                    if added_column or force:

                        # add field to index
                        self.index_additionnal_fields.append(info_id_sql)

                        # Build the per-field UPDATE fragment extracting
                        # "info=value" from the raw INFO column; '' and '.' map to NULL
                        if connexion_format in ["duckdb"]:
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
                                END
                            """
                        elif connexion_format in ["sqlite"]:
                            # SQLite has no regex by default: emulate the extraction
                            # with instr/substr string arithmetic
                            update_info_field = f"""
                            "{info_id_sql}" =
                                CASE
                                    WHEN instr(INFO, '{info}=') = 0 THEN NULL
                                    WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
                                    ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
                                END
                            """

                        sql_info_alter_table_array.append(update_info_field)

            if sql_info_alter_table_array:

                # Process chromosome by chromosome to keep UPDATE transactions smaller
                try:
                    chromosomes_list = list(
                        self.get_query_to_df(
                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
                        )["#CHROM"]
                    )
                except:
                    # NOTE(review): bare except — falls back to a single whole-table pass
                    chromosomes_list = [None]

                for chrom in chromosomes_list:
                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")

                    # Where clause (only when there is more than one chromosome)
                    where_clause = ""
                    if chrom and len(chromosomes_list) > 1:
                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """

                    # Update table
                    if proccess_all_fields_together:
                        # One UPDATE setting all exploded columns at once
                        sql_info_alter_table_array_join = ", ".join(
                            sql_info_alter_table_array
                        )
                        if sql_info_alter_table_array_join:
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter_table_array_join}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)
                    else:
                        # One UPDATE per exploded column
                        sql_info_alter_num = 0
                        for sql_info_alter in sql_info_alter_table_array:
                            sql_info_alter_num += 1
                            sql_info_alter_table = f"""
                            UPDATE {table_variants}
                            SET {sql_info_alter}
                            {where_clause}
                            """
                            log.debug(
                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
                            )
                            # log.debug(sql_info_alter_table)
                            self.conn.execute(sql_info_alter_table)

        # Re-create indexes (including the newly registered additional fields)
        if create_index:
            self.create_indexes()

        return added_columns

    def create_indexes(self) -> None:
        """
        Create indexes on the table after insertion: a composite index on
        ("#CHROM", "POS", "REF", "ALT"), one single-column index per coordinate
        column, and one index per field registered in `index_additionnal_fields`.
        Does nothing when indexing is disabled or access mode is read-only ("RO").
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        if self.get_indexing() and access not in ["RO"]:
            # Create index
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
            self.conn.execute(sql_create_table_index)
            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
            self.conn.execute(sql_create_table_index)
            for field in self.index_additionnal_fields:
                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
                self.conn.execute(sql_create_table_index)

    def drop_indexes(self) -> None:
        """
        Drop all indexes of the variants table (listed through `duckdb_indexes` or
        `sqlite_master` depending on the connexion format). Does nothing in
        read-only ("RO") access mode.
        """

        # Access
        access = self.get_config().get("access", None)

        # get table variants
        table_variants = self.get_table_variants("FROM")

        # Get database format
        connexion_format = self.get_connexion_format()

        if access not in ["RO"]:
            if connexion_format in ["duckdb"]:
                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
            elif connexion_format in ["sqlite"]:
                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"

            # NOTE(review): if connexion_format is neither "duckdb" nor "sqlite",
            # sql_list_indexes is unbound and the next line raises NameError — confirm
            # whether other formats can reach this point.
            list_indexes = self.conn.execute(sql_list_indexes)
            index_names = [row[0] for row in list_indexes.fetchall()]
            for index in index_names:
                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
                self.conn.execute(sql_drop_table_index)

    def read_vcf_header(self, f) -> list:
        """
        It reads the header of a VCF file and returns a list of the header lines,
        up to and including the "#CHROM" column line.

        :param f: the file object (opened in text mode)
        :return: The header lines of the VCF file.
        """

        header_list = []
        for line in f:
            header_list.append(line)
            # The "#CHROM" line is the last header line in a VCF file
            if line.startswith("#CHROM"):
                break
        return header_list

    def read_vcf_header_file(self, file: str = None) -> list:
        """
        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed
        (bgzip) and uncompressed files.

        :param file: The `file` parameter is a string that represents the path to the VCF header file
        that you want to read. It is an optional parameter, so if you don't provide a value, it will
        default to `None`
        :type file: str
        :return: The function `read_vcf_header_file` returns a list of header lines.
        """

        if self.get_input_compressed(input_file=file):
            with bgzf.open(file, "rt") as f:
                return self.read_vcf_header(f=f)
        else:
            with open(file, "rt") as f:
                return self.read_vcf_header(f=f)

    def execute_query(self, query: str):
        """
        It takes a query as an argument, executes it, and returns the results

        :param query: The query to be executed
        :return: The result of the query is being returned.
        """
        if query:
            return self.conn.execute(query)  # .fetchall()
        else:
            return None

    def export_output(
        self,
        output_file: str | None = None,
        output_header: str | None = None,
        export_header: bool = True,
        query: str | None = None,
        parquet_partitions: list | None = None,
        chunk_size: int | None = None,
        threads: int | None = None,
        sort: bool = False,
        index: bool = False,
        order_by: str | None = None,
    ) -> bool:
        """
        The `export_output` function exports data from a VCF file to a specified output file in various
        formats, including VCF, CSV, TSV, PSV, and Parquet.

        :param output_file: the name of the output file to generate; defaults to the
        object's configured output
        :type output_file: str
        :param output_header: the file where the VCF header will be exported; defaults to
        `<output_file>.hdr`
        :type output_header: str
        :param export_header: whether the header should be exported to a separate file,
        defaults to True (switched off when the output format is VCF)
        :type export_header: bool (optional)
        :param query: an optional SQL query used to filter and select the data to export
        :type query: str
        :param parquet_partitions: the columns used for partitioning the Parquet output;
        a comma-separated string is also accepted (from parameters)
        :type parquet_partitions: list
        :param chunk_size: the number of records per batch when exporting in Parquet format
        :type chunk_size: int
        :param threads: the number of threads used during export; defaults to the
        object's configured threads
        :type threads: int
        :param sort: whether the output file should be sorted on genomic coordinates,
        defaults to False
        :type sort: bool (optional)
        :param index: whether an index should be created on the output file, defaults to False
        :type index: bool (optional)
        :param order_by: the column(s) used to sort the output (VCF export), defaults to the
        `export.order_by` parameter
        :type order_by: str
        :return: True if the output file exists after export, None otherwise.
        """

        # Log
        log.info("Exporting...")

        # Resolve relative paths
        output_file = full_path(output_file)
        output_header = full_path(output_header)

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Tmp files to remove at the end of the export
        tmp_to_remove = []

        # If no output, get it
        if not output_file:
            output_file = self.get_output()

        # If not threads
        if not threads:
            threads = self.get_threads()

        # Auto header name with extension
        if export_header or output_header:
            if not output_header:
                output_header = f"{output_file}.hdr"
            # Export header
            self.export_header(output_file=output_file)

        # Switch off export header if VCF output (header is embedded in the VCF itself)
        output_file_type = get_file_format(output_file)
        if output_file_type in ["vcf"]:
            export_header = False
            tmp_to_remove.append(output_header)

        # Chunk size
        if not chunk_size:
            chunk_size = config.get("chunk_size", None)

        # Parquet partition (accept comma-separated string from parameters)
        if not parquet_partitions:
            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
        if parquet_partitions and isinstance(parquet_partitions, str):
            parquet_partitions = parquet_partitions.split(",")

        # Order by
        if not order_by:
            order_by = param.get("export", {}).get("order_by", "")

        # Header in output
        header_in_output = param.get("export", {}).get("include_header", False)

        # Database source: the live connexion by default
        database_source = self.get_connexion()

        # Connexion format
        connexion_format = self.get_connexion_format()

        # Explode infos before export if requested
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=False,
            )

        # if connexion_format in ["sqlite"] or query:
        if connexion_format in ["sqlite"]:

            # SQLite cannot be exported directly: dump the variants table to a
            # temporary Parquet file and export from there
            random_tmp = "".join(
                random.choice(string.ascii_lowercase) for i in range(10)
            )
            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
            tmp_to_remove.append(database_source)

            # Table Variants
            table_variants = self.get_table_variants()

            # Create export query
            sql_query_export_subquery = f"""
                SELECT * FROM {table_variants}
                """

            # Write source file
            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))

        # Create database object wrapping the source to export
        database = Database(
            database=database_source,
            table="variants",
            header_file=output_header,
            conn_config=self.get_connexion_config(),
        )

        # Existing columns header
        existing_columns_header = database.get_header_columns_from_database(query=query)

        # Sample list (VCF outputs only)
        if output_file_type in ["vcf"]:
            get_samples = self.get_samples()
            get_samples_check = self.get_samples_check()
            # Force the sample list only when samples were explicitly provided
            samples_force = get_samples is not None
            sample_list = self.get_header_sample_list(
                check=get_samples_check,
                samples=get_samples,
                samples_force=samples_force,
            )
        else:
            sample_list = None

        # Export file
        database.export(
            output_database=output_file,
            output_header=output_header,
            existing_columns_header=existing_columns_header,
            parquet_partitions=parquet_partitions,
            chunk_size=chunk_size,
            threads=threads,
            sort=sort,
            index=index,
            header_in_output=header_in_output,
            order_by=order_by,
            query=query,
            export_header=export_header,
            sample_list=sample_list,
        )

        # Remove temporary files
        remove_if_exists(tmp_to_remove)

        # NOTE(review): both operands of `and` are the same expression —
        # `os.path.exists(output_file) or None` alone is equivalent.
        return (os.path.exists(output_file) or None) and (
            os.path.exists(output_file) or None
        )

    def get_extra_infos(self, table: str = None) -> list:
        """
        The `get_extra_infos`
function returns a list of columns that are in a specified table but not 2291 in the header. 2292 2293 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2294 name of the table from which you want to retrieve the extra columns that are not present in the 2295 header. If the `table` parameter is not provided when calling the function, it will default to 2296 using the variants 2297 :type table: str 2298 :return: A list of columns that are in the specified table but not in the header of the table. 2299 """ 2300 2301 header_columns = [] 2302 2303 if not table: 2304 table = self.get_table_variants(clause="from") 2305 header_columns = self.get_header_columns() 2306 2307 # Check all columns in the database 2308 query = f""" SELECT * FROM {table} LIMIT 1 """ 2309 log.debug(f"query {query}") 2310 table_columns = self.get_query_to_df(query).columns.tolist() 2311 extra_columns = [] 2312 2313 # Construct extra infos (not in header) 2314 for column in table_columns: 2315 if column not in header_columns: 2316 extra_columns.append(column) 2317 2318 return extra_columns 2319 2320 def get_extra_infos_sql(self, table: str = None) -> str: 2321 """ 2322 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2323 by double quotes 2324 2325 :param table: The name of the table to get the extra infos from. If None, the default table is 2326 used 2327 :type table: str 2328 :return: A string of the extra infos 2329 """ 2330 2331 return ", ".join( 2332 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2333 ) 2334 2335 def export_header( 2336 self, 2337 header_name: str = None, 2338 output_file: str = None, 2339 output_file_ext: str = ".hdr", 2340 clean_header: bool = True, 2341 remove_chrom_line: bool = False, 2342 ) -> str: 2343 """ 2344 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2345 specified options, and writes it to a new file. 
2346 2347 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2348 this parameter is not specified, the header will be written to the output file 2349 :type header_name: str 2350 :param output_file: The `output_file` parameter in the `export_header` function is used to 2351 specify the name of the output file where the header will be written. If this parameter is not 2352 provided, the header will be written to a temporary file 2353 :type output_file: str 2354 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2355 string that represents the extension of the output header file. By default, it is set to ".hdr" 2356 if not specified by the user. This extension will be appended to the `output_file` name to 2357 create the final, defaults to .hdr 2358 :type output_file_ext: str (optional) 2359 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2360 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2361 `True`, the function will clean the header by modifying certain lines based on a specific 2362 pattern. If `clean_header`, defaults to True 2363 :type clean_header: bool (optional) 2364 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2365 boolean flag that determines whether the #CHROM line should be removed from the header before 2366 writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `, 2367 defaults to False 2368 :type remove_chrom_line: bool (optional) 2369 :return: The function `export_header` returns the name of the temporary header file that is 2370 created. 
2371 """ 2372 2373 if not header_name and not output_file: 2374 output_file = self.get_output() 2375 2376 if self.get_header(): 2377 2378 # Get header object 2379 header_obj = self.get_header() 2380 2381 # Create database 2382 db_for_header = Database(database=self.get_input()) 2383 2384 # Get real columns in the file 2385 db_header_columns = db_for_header.get_columns() 2386 2387 with tempfile.TemporaryDirectory() as tmpdir: 2388 2389 # Write header file 2390 header_file_tmp = os.path.join(tmpdir, "header") 2391 f = open(header_file_tmp, "w") 2392 vcf.Writer(f, header_obj) 2393 f.close() 2394 2395 # Replace #CHROM line with rel columns 2396 header_list = db_for_header.read_header_file( 2397 header_file=header_file_tmp 2398 ) 2399 header_list[-1] = "\t".join(db_header_columns) 2400 2401 # Remove CHROM line 2402 if remove_chrom_line: 2403 header_list.pop() 2404 2405 # Clean header 2406 if clean_header: 2407 header_list_clean = [] 2408 for head in header_list: 2409 # Clean head for malformed header 2410 head_clean = head 2411 head_clean = re.subn( 2412 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2413 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2414 head_clean, 2415 2, 2416 )[0] 2417 # Write header 2418 header_list_clean.append(head_clean) 2419 header_list = header_list_clean 2420 2421 tmp_header_name = output_file + output_file_ext 2422 2423 f = open(tmp_header_name, "w") 2424 for line in header_list: 2425 f.write(line) 2426 f.close() 2427 2428 return tmp_header_name 2429 2430 def export_variant_vcf( 2431 self, 2432 vcf_file, 2433 remove_info: bool = False, 2434 add_samples: bool = True, 2435 list_samples: list = [], 2436 where_clause: str = "", 2437 index: bool = False, 2438 threads: int | None = None, 2439 ) -> bool | None: 2440 """ 2441 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2442 remove INFO field, add samples, and control compression and indexing. 
2443 2444 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2445 written to. It is the output file that will contain the filtered VCF data based on the specified 2446 parameters 2447 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2448 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2449 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2450 in, defaults to False 2451 :type remove_info: bool (optional) 2452 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2453 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2454 If set to False, the samples will be removed. The default value is True, defaults to True 2455 :type add_samples: bool (optional) 2456 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2457 in the output VCF file. By default, all samples will be included. If you provide a list of 2458 samples, only those samples will be included in the output file 2459 :type list_samples: list 2460 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2461 determines whether or not to create an index for the output VCF file. If `index` is set to 2462 `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False 2463 :type index: bool (optional) 2464 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2465 number of threads to use for exporting the VCF file. It determines how many parallel threads 2466 will be used during the export process. More threads can potentially speed up the export process 2467 by utilizing multiple cores of the processor. 
If 2468 :type threads: int | None 2469 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2470 method with various parameters including the output file, query, threads, sort flag, and index 2471 flag. The `export_output` method is responsible for exporting the VCF data based on the 2472 specified parameters and configurations provided in the `export_variant_vcf` function. 2473 """ 2474 2475 # Config 2476 config = self.get_config() 2477 2478 # Extract VCF 2479 log.debug("Export VCF...") 2480 2481 # Table variants 2482 table_variants = self.get_table_variants() 2483 2484 # Threads 2485 if not threads: 2486 threads = self.get_threads() 2487 2488 # Info fields 2489 if remove_info: 2490 if not isinstance(remove_info, str): 2491 remove_info = "." 2492 info_field = f"""'{remove_info}' as INFO""" 2493 else: 2494 info_field = "INFO" 2495 2496 # Samples fields 2497 if add_samples: 2498 if not list_samples: 2499 list_samples = self.get_header_sample_list() 2500 if list_samples: 2501 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2502 else: 2503 samples_fields = "" 2504 log.debug(f"samples_fields: {samples_fields}") 2505 else: 2506 samples_fields = "" 2507 2508 # Where clause 2509 if where_clause is None: 2510 where_clause = "" 2511 2512 # Variants 2513 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2514 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2515 log.debug(f"sql_query_select={sql_query_select}") 2516 2517 return self.export_output( 2518 output_file=vcf_file, 2519 output_header=None, 2520 export_header=True, 2521 query=sql_query_select, 2522 parquet_partitions=None, 2523 chunk_size=config.get("chunk_size", None), 2524 threads=threads, 2525 sort=True, 2526 index=index, 2527 order_by=None, 2528 ) 2529 2530 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2531 """ 2532 It takes a list of commands and runs 
    def get_threads(self, default: int = 1) -> int:
        """
        Return the number of threads to use for the current job.

        Resolution order: `param["threads"]`, then `config["threads"]`, then
        `default`. A configured value <= 0 means "use all available cores".

        :param default: Fallback number of threads used when no value is
            configured in param or config, defaults to 1
        :type default: int (optional)
        :return: the number of threads to use for the current job.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input threads: param takes precedence over config
        input_thread = param.get("threads", config.get("threads", None))

        # Check threads: non-positive values request all CPU cores
        if not input_thread:
            threads = default
        elif int(input_thread) <= 0:
            threads = os.cpu_count()
        else:
            threads = int(input_thread)
        return threads

    def get_memory(self, default: str = None) -> str:
        """
        Return the configured memory setting.

        Resolution order: `param["memory"]`, then `config["memory"]`, then
        `default`.

        :param default: Fallback memory value used when no value is configured
            in param or config
        :type default: str
        :return: The memory setting as a string, or `default` if not configured
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Input memory: param takes precedence over config
        input_memory = param.get("memory", config.get("memory", None))

        # Fall back to the provided default
        if input_memory:
            memory = input_memory
        else:
            memory = default

        return memory

    def update_from_vcf(self, vcf_file: str) -> None:
        """
        Update the variants table from a VCF file, dispatching on the
        connexion format: duckdb uses the DataFrame method, sqlite uses the
        temporary-table method.

        :param vcf_file: the path to the VCF file
        """

        connexion_format = self.get_connexion_format()

        if connexion_format in ["duckdb"]:
            self.update_from_vcf_duckdb(vcf_file)
        elif connexion_format in ["sqlite"]:
            self.update_from_vcf_sqlite(vcf_file)

    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file (duckdb backend).

        Existing INFO values are kept and the VCF's INFO is appended with a
        ';' separator; '' and '.' are treated as empty INFO.

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF into a DataFrame; skipping the meta-header lines
        # leaves the '#CHROM' line as the column header row
        # (assumes get_header_length counts exactly the '##' lines — confirm)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: duckdb resolves `vcf_df` in the query below by the local
        # variable name (replacement scan) — do not rename it
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
        SET INFO = concat(
                        CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END,
                        (
                        SELECT
                            concat(
                                CASE
                                    WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                    THEN ';'
                                    ELSE ''
                                END
                                ,
                                CASE
                                    WHEN table_parquet.INFO NOT IN ('','.')
                                    THEN table_parquet.INFO
                                    ELSE ''
                                END
                            )
                        FROM vcf_df as table_parquet
                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                            AND table_parquet.\"POS\" = table_variants.\"POS\"
                            AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                            AND table_parquet.\"REF\" = table_variants.\"REF\"
                            AND table_parquet.INFO NOT IN ('','.')
                        )
                    )
        ;
        """
        self.conn.execute(sql_query_update)
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Update the INFO column of the variants table with the INFO column of a
        VCF file (sqlite backend), via a temporary table.

        Existing INFO values are kept and the VCF's INFO is appended with a
        ';' separator; '' and '.' are treated as empty INFO.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Loading VCF into temporary table
        # NOTE(review): assumes the VCF has exactly 8 columns (no
        # FORMAT/sample columns) — confirm against callers
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        sql_query_update = f"""
        UPDATE variants as table_variants
        SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                        AND table_vcf.\"POS\" = table_variants.\"POS\"
                        AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                        AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

    def drop_variants_table(self) -> None:
        """
        Drop the variants table if it exists.
        """

        table_variants = self.get_table_variants()
        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
        self.conn.execute(sql_table_variants)

    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a `variant_id` column to the variants table and populate it with a
        hash of the assembly, `#CHROM`, `POS`, `REF` and `ALT` columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
            to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be (re)populated even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param takes precedence over config, then default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # INFO/Tag prefix
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE (added columns are dropped again at the end)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # variant_id column
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash argument is the single-quoted SQL
            # string literal '"{prefix}SVTYPE"' (a constant), not the value of
            # the exploded SVTYPE column — confirm this is intended
            self.conn.execute(
                f"""
                UPDATE {table_variants}
                SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column
'"{prefix}SVTYPE"') 2789 """ 2790 ) 2791 2792 # Remove added columns 2793 for added_column in added_columns: 2794 self.drop_column(column=added_column) 2795 2796 # return variant_id column name 2797 return variant_id_column 2798 2799 def get_variant_id_column( 2800 self, variant_id_column: str = "variant_id", force: bool = None 2801 ) -> str: 2802 """ 2803 This function returns the variant_id column name 2804 2805 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2806 defaults to variant_id 2807 :type variant_id_column: str (optional) 2808 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2809 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2810 if it is not already set, or if it is set 2811 :type force: bool 2812 :return: The variant_id column name. 2813 """ 2814 2815 return self.set_variant_id(variant_id_column=variant_id_column, force=force) 2816 2817 ### 2818 # Annotation 2819 ### 2820 2821 def scan_databases( 2822 self, 2823 database_formats: list = ["parquet"], 2824 database_releases: list = ["current"], 2825 ) -> dict: 2826 """ 2827 The function `scan_databases` scans for available databases based on specified formats and 2828 releases. 2829 2830 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2831 of the databases to be scanned. In this case, the accepted format is "parquet" 2832 :type database_formats: list ["parquet"] 2833 :param database_releases: The `database_releases` parameter is a list that specifies the 2834 releases of the databases to be scanned. 
In the provided function, the default value for 2835 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2836 databases that are in the "current" 2837 :type database_releases: list 2838 :return: The function `scan_databases` returns a dictionary containing information about 2839 databases that match the specified formats and releases. 2840 """ 2841 2842 # Config 2843 config = self.get_config() 2844 2845 # Param 2846 param = self.get_param() 2847 2848 # Param - Assembly 2849 assembly = param.get("assembly", config.get("assembly", None)) 2850 if not assembly: 2851 assembly = DEFAULT_ASSEMBLY 2852 log.warning(f"Default assembly '{assembly}'") 2853 2854 # Scan for availabled databases 2855 log.info( 2856 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2857 ) 2858 databases_infos_dict = databases_infos( 2859 database_folder_releases=database_releases, 2860 database_formats=database_formats, 2861 assembly=assembly, 2862 config=config, 2863 ) 2864 log.info( 2865 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2866 ) 2867 2868 return databases_infos_dict 2869 2870 def annotation(self) -> None: 2871 """ 2872 It annotates the VCF file with the annotations specified in the config file. 
2873 """ 2874 2875 # Config 2876 config = self.get_config() 2877 2878 # Param 2879 param = self.get_param() 2880 2881 # Param - Assembly 2882 assembly = param.get("assembly", config.get("assembly", None)) 2883 if not assembly: 2884 assembly = DEFAULT_ASSEMBLY 2885 log.warning(f"Default assembly '{assembly}'") 2886 2887 # annotations databases folders 2888 annotations_databases = set( 2889 config.get("folders", {}) 2890 .get("databases", {}) 2891 .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER]) 2892 + config.get("folders", {}) 2893 .get("databases", {}) 2894 .get("parquet", ["~/howard/databases/parquet/current"]) 2895 + config.get("folders", {}) 2896 .get("databases", {}) 2897 .get("bcftools", ["~/howard/databases/bcftools/current"]) 2898 ) 2899 2900 # Get param annotations 2901 if param.get("annotations", None) and isinstance( 2902 param.get("annotations", None), str 2903 ): 2904 log.debug(param.get("annotations", None)) 2905 param_annotation_list = param.get("annotations").split(",") 2906 else: 2907 param_annotation_list = [] 2908 2909 # Each tools param 2910 if param.get("annotation_parquet", None) != None: 2911 log.debug( 2912 f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}""" 2913 ) 2914 if isinstance(param.get("annotation_parquet", None), list): 2915 param_annotation_list.append(",".join(param.get("annotation_parquet"))) 2916 else: 2917 param_annotation_list.append(param.get("annotation_parquet")) 2918 if param.get("annotation_snpsift", None) != None: 2919 if isinstance(param.get("annotation_snpsift", None), list): 2920 param_annotation_list.append( 2921 "snpsift:" 2922 + "+".join(param.get("annotation_snpsift")).replace(",", "+") 2923 ) 2924 else: 2925 param_annotation_list.append( 2926 "snpsift:" + param.get("annotation_snpsift").replace(",", "+") 2927 ) 2928 if param.get("annotation_snpeff", None) != None: 2929 param_annotation_list.append("snpeff:" + param.get("annotation_snpeff")) 2930 if param.get("annotation_bcftools", 
None) != None: 2931 if isinstance(param.get("annotation_bcftools", None), list): 2932 param_annotation_list.append( 2933 "bcftools:" 2934 + "+".join(param.get("annotation_bcftools")).replace(",", "+") 2935 ) 2936 else: 2937 param_annotation_list.append( 2938 "bcftools:" + param.get("annotation_bcftools").replace(",", "+") 2939 ) 2940 if param.get("annotation_annovar", None) != None: 2941 param_annotation_list.append("annovar:" + param.get("annotation_annovar")) 2942 if param.get("annotation_exomiser", None) != None: 2943 param_annotation_list.append("exomiser:" + param.get("annotation_exomiser")) 2944 if param.get("annotation_splice", None) != None: 2945 param_annotation_list.append("splice:" + param.get("annotation_splice")) 2946 2947 # Merge param annotations list 2948 param["annotations"] = ",".join(param_annotation_list) 2949 2950 # debug 2951 log.debug(f"param_annotations={param['annotations']}") 2952 2953 if param.get("annotations"): 2954 2955 # Log 2956 # log.info("Annotations - Check annotation parameters") 2957 2958 if not "annotation" in param: 2959 param["annotation"] = {} 2960 2961 # List of annotations parameters 2962 annotations_list_input = {} 2963 if isinstance(param.get("annotations", None), str): 2964 annotation_file_list = [ 2965 value for value in param.get("annotations", "").split(",") 2966 ] 2967 for annotation_file in annotation_file_list: 2968 annotations_list_input[annotation_file.strip()] = {"INFO": None} 2969 else: 2970 annotations_list_input = param.get("annotations", {}) 2971 2972 log.info(f"Quick Annotations:") 2973 for annotation_key in list(annotations_list_input.keys()): 2974 log.info(f" {annotation_key}") 2975 2976 # List of annotations and associated fields 2977 annotations_list = {} 2978 2979 for annotation_file in annotations_list_input: 2980 2981 # Explode annotations if ALL 2982 if ( 2983 annotation_file.upper() == "ALL" 2984 or annotation_file.upper().startswith("ALL:") 2985 ): 2986 2987 # check ALL parameters (formats, 
releases) 2988 annotation_file_split = annotation_file.split(":") 2989 database_formats = "parquet" 2990 database_releases = "current" 2991 for annotation_file_option in annotation_file_split[1:]: 2992 database_all_options_split = annotation_file_option.split("=") 2993 if database_all_options_split[0] == "format": 2994 database_formats = database_all_options_split[1].split("+") 2995 if database_all_options_split[0] == "release": 2996 database_releases = database_all_options_split[1].split("+") 2997 2998 # Scan for availabled databases 2999 databases_infos_dict = self.scan_databases( 3000 database_formats=database_formats, 3001 database_releases=database_releases, 3002 ) 3003 3004 # Add found databases in annotation parameters 3005 for database_infos in databases_infos_dict.keys(): 3006 annotations_list[database_infos] = {"INFO": None} 3007 3008 else: 3009 annotations_list[annotation_file] = annotations_list_input[ 3010 annotation_file 3011 ] 3012 3013 # Check each databases 3014 if len(annotations_list): 3015 3016 log.info( 3017 f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..." 
3018 ) 3019 3020 for annotation_file in annotations_list: 3021 3022 # Init 3023 annotations = annotations_list.get(annotation_file, None) 3024 3025 # Annotation snpEff 3026 if annotation_file.startswith("snpeff"): 3027 3028 log.debug(f"Quick Annotation snpEff") 3029 3030 if "snpeff" not in param["annotation"]: 3031 param["annotation"]["snpeff"] = {} 3032 3033 if "options" not in param["annotation"]["snpeff"]: 3034 param["annotation"]["snpeff"]["options"] = "" 3035 3036 # snpEff options in annotations 3037 param["annotation"]["snpeff"]["options"] = "".join( 3038 annotation_file.split(":")[1:] 3039 ) 3040 3041 # Annotation Annovar 3042 elif annotation_file.startswith("annovar"): 3043 3044 log.debug(f"Quick Annotation Annovar") 3045 3046 if "annovar" not in param["annotation"]: 3047 param["annotation"]["annovar"] = {} 3048 3049 if "annotations" not in param["annotation"]["annovar"]: 3050 param["annotation"]["annovar"]["annotations"] = {} 3051 3052 # Options 3053 annotation_file_split = annotation_file.split(":") 3054 for annotation_file_annotation in annotation_file_split[1:]: 3055 if annotation_file_annotation: 3056 param["annotation"]["annovar"]["annotations"][ 3057 annotation_file_annotation 3058 ] = annotations 3059 3060 # Annotation Exomiser 3061 elif annotation_file.startswith("exomiser"): 3062 3063 log.debug(f"Quick Annotation Exomiser") 3064 3065 param["annotation"]["exomiser"] = params_string_to_dict( 3066 annotation_file 3067 ) 3068 3069 # Annotation Splice 3070 elif annotation_file.startswith("splice"): 3071 3072 log.debug(f"Quick Annotation Splice") 3073 3074 param["annotation"]["splice"] = params_string_to_dict( 3075 annotation_file 3076 ) 3077 3078 # Annotation Parquet or BCFTOOLS 3079 else: 3080 3081 # Tools detection 3082 if annotation_file.startswith("bcftools:"): 3083 annotation_tool_initial = "bcftools" 3084 annotation_file = ":".join(annotation_file.split(":")[1:]) 3085 elif annotation_file.startswith("snpsift:"): 3086 annotation_tool_initial = 
"snpsift" 3087 annotation_file = ":".join(annotation_file.split(":")[1:]) 3088 elif annotation_file.startswith("bigwig:"): 3089 annotation_tool_initial = "bigwig" 3090 annotation_file = ":".join(annotation_file.split(":")[1:]) 3091 else: 3092 annotation_tool_initial = None 3093 3094 # list of files 3095 annotation_file_list = annotation_file.replace("+", ":").split( 3096 ":" 3097 ) 3098 3099 for annotation_file in annotation_file_list: 3100 3101 if annotation_file: 3102 3103 # Annotation tool initial 3104 annotation_tool = annotation_tool_initial 3105 3106 # Find file 3107 annotation_file_found = None 3108 3109 if os.path.exists(annotation_file): 3110 annotation_file_found = annotation_file 3111 elif os.path.exists(full_path(annotation_file)): 3112 annotation_file_found = full_path(annotation_file) 3113 else: 3114 # Find within assembly folders 3115 for annotations_database in annotations_databases: 3116 found_files = find_all( 3117 annotation_file, 3118 os.path.join( 3119 annotations_database, assembly 3120 ), 3121 ) 3122 if len(found_files) > 0: 3123 annotation_file_found = found_files[0] 3124 break 3125 if not annotation_file_found and not assembly: 3126 # Find within folders 3127 for ( 3128 annotations_database 3129 ) in annotations_databases: 3130 found_files = find_all( 3131 annotation_file, annotations_database 3132 ) 3133 if len(found_files) > 0: 3134 annotation_file_found = found_files[0] 3135 break 3136 log.debug( 3137 f"for {annotation_file} annotation_file_found={annotation_file_found}" 3138 ) 3139 3140 # Full path 3141 annotation_file_found = full_path(annotation_file_found) 3142 3143 if annotation_file_found: 3144 3145 database = Database(database=annotation_file_found) 3146 quick_annotation_format = database.get_format() 3147 quick_annotation_is_compressed = ( 3148 database.is_compressed() 3149 ) 3150 quick_annotation_is_indexed = os.path.exists( 3151 f"{annotation_file_found}.tbi" 3152 ) 3153 bcftools_preference = False 3154 3155 # Check Annotation 
Tool 3156 if not annotation_tool: 3157 if ( 3158 bcftools_preference 3159 and quick_annotation_format 3160 in ["vcf", "bed"] 3161 and quick_annotation_is_compressed 3162 and quick_annotation_is_indexed 3163 ): 3164 annotation_tool = "bcftools" 3165 elif quick_annotation_format in [ 3166 "vcf", 3167 "bed", 3168 "tsv", 3169 "tsv", 3170 "csv", 3171 "json", 3172 "tbl", 3173 "parquet", 3174 "duckdb", 3175 ]: 3176 annotation_tool = "parquet" 3177 elif quick_annotation_format in [ 3178 "bw" 3179 ]: 3180 annotation_tool = "bigwig" 3181 else: 3182 log.error( 3183 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3184 ) 3185 raise ValueError( 3186 f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet" 3187 ) 3188 3189 log.debug( 3190 f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}" 3191 ) 3192 3193 # Annotation Tool dispatch 3194 if annotation_tool: 3195 if annotation_tool not in param["annotation"]: 3196 param["annotation"][annotation_tool] = {} 3197 if ( 3198 "annotations" 3199 not in param["annotation"][annotation_tool] 3200 ): 3201 param["annotation"][annotation_tool][ 3202 "annotations" 3203 ] = {} 3204 param["annotation"][annotation_tool][ 3205 "annotations" 3206 ][annotation_file_found] = annotations 3207 3208 else: 3209 log.warning( 3210 f"Quick Annotation File {annotation_file} does NOT exist" 3211 ) 3212 3213 self.set_param(param) 3214 3215 if param.get("annotation", None): 3216 log.info("Annotations") 3217 if param.get("annotation", {}).get("parquet", None): 3218 log.info("Annotations 'parquet'...") 3219 self.annotation_parquet() 3220 if param.get("annotation", {}).get("bcftools", None): 3221 log.info("Annotations 'bcftools'...") 3222 self.annotation_bcftools() 3223 if param.get("annotation", {}).get("snpsift", None): 3224 log.info("Annotations 'snpsift'...") 3225 self.annotation_snpsift() 3226 if param.get("annotation", 
{}).get("bigwig", None): 3227 log.info("Annotations 'bigwig'...") 3228 self.annotation_bigwig() 3229 if param.get("annotation", {}).get("annovar", None): 3230 log.info("Annotations 'annovar'...") 3231 self.annotation_annovar() 3232 if param.get("annotation", {}).get("snpeff", None): 3233 log.info("Annotations 'snpeff'...") 3234 self.annotation_snpeff() 3235 if param.get("annotation", {}).get("exomiser", None) is not None: 3236 log.info("Annotations 'exomiser'...") 3237 self.annotation_exomiser() 3238 if param.get("annotation", {}).get("splice", None) is not None: 3239 log.info("Annotations 'splice' ...") 3240 self.annotation_splice() 3241 3242 # Explode INFOS fields into table fields 3243 if self.get_explode_infos(): 3244 self.explode_infos( 3245 prefix=self.get_explode_infos_prefix(), 3246 fields=self.get_explode_infos_fields(), 3247 force=True, 3248 ) 3249 3250 3251 def annotation_bigwig(self, threads: int = None) -> None: 3252 """ 3253 The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases. 3254 3255 :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the 3256 number of threads to be used for parallel processing during the annotation process. 
If the 3257 `threads` parameter is not provided, the method will attempt to determine the optimal number of 3258 threads to use based on the system configuration 3259 :type threads: int 3260 :return: True 3261 """ 3262 3263 # DEBUG 3264 log.debug("Start annotation with bigwig databases") 3265 3266 # # Threads 3267 # if not threads: 3268 # threads = self.get_threads() 3269 # log.debug("Threads: " + str(threads)) 3270 3271 # Config 3272 config = self.get_config() 3273 log.debug("Config: " + str(config)) 3274 3275 # Config - BCFTools databases folders 3276 databases_folders = set( 3277 self.get_config() 3278 .get("folders", {}) 3279 .get("databases", {}) 3280 .get("annotations", ["."]) 3281 + self.get_config() 3282 .get("folders", {}) 3283 .get("databases", {}) 3284 .get("bigwig", ["."]) 3285 ) 3286 log.debug("Databases annotations: " + str(databases_folders)) 3287 3288 # Param 3289 annotations = ( 3290 self.get_param() 3291 .get("annotation", {}) 3292 .get("bigwig", {}) 3293 .get("annotations", None) 3294 ) 3295 log.debug("Annotations: " + str(annotations)) 3296 3297 # Assembly 3298 assembly = self.get_param().get( 3299 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3300 ) 3301 3302 # Data 3303 table_variants = self.get_table_variants() 3304 3305 # Check if not empty 3306 log.debug("Check if not empty") 3307 sql_query_chromosomes = ( 3308 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3309 ) 3310 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3311 if not sql_query_chromosomes_df["count"][0]: 3312 log.info(f"VCF empty") 3313 return 3314 3315 # VCF header 3316 vcf_reader = self.get_header() 3317 log.debug("Initial header: " + str(vcf_reader.infos)) 3318 3319 # Existing annotations 3320 for vcf_annotation in self.get_header().infos: 3321 3322 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3323 log.debug( 3324 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3325 
) 3326 3327 if annotations: 3328 3329 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3330 3331 # Export VCF file 3332 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3333 3334 # annotation_bigwig_config 3335 annotation_bigwig_config_list = [] 3336 3337 for annotation in annotations: 3338 annotation_fields = annotations[annotation] 3339 3340 # Annotation Name 3341 annotation_name = os.path.basename(annotation) 3342 3343 if not annotation_fields: 3344 annotation_fields = {"INFO": None} 3345 3346 log.debug(f"Annotation '{annotation_name}'") 3347 log.debug( 3348 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3349 ) 3350 3351 # Create Database 3352 database = Database( 3353 database=annotation, 3354 databases_folders=databases_folders, 3355 assembly=assembly, 3356 ) 3357 3358 # Find files 3359 db_file = database.get_database() 3360 db_file = full_path(db_file) 3361 db_hdr_file = database.get_header_file() 3362 db_hdr_file = full_path(db_hdr_file) 3363 db_file_type = database.get_format() 3364 3365 # If db_file is http ? 
3366 if database.get_database().startswith("http"): 3367 3368 # Datbase is HTTP URL 3369 db_file_is_http = True 3370 3371 # DB file keep as URL 3372 db_file = database.get_database() 3373 log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)") 3374 3375 # Retrieve automatic annotation field name 3376 annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", "")) 3377 log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL") 3378 3379 # Create automatic header file 3380 db_hdr_file = os.path.join(tmp_dir, "header.hdr") 3381 with open(db_hdr_file, 'w') as f: 3382 f.write("##fileformat=VCFv4.2\n") 3383 f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""") 3384 f.write(f"#CHROM START END {annotation_field}\n") 3385 3386 else: 3387 3388 # Datbase is NOT HTTP URL 3389 db_file_is_http = False 3390 3391 3392 # Check index - try to create if not exists 3393 if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]: 3394 #if False: 3395 log.error("Annotation failed: database not valid") 3396 log.error(f"Annotation annotation file: {db_file}") 3397 log.error(f"Annotation annotation file type: {db_file_type}") 3398 log.error(f"Annotation annotation header: {db_hdr_file}") 3399 raise ValueError( 3400 f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}" 3401 ) 3402 else: 3403 3404 # Log 3405 log.debug( 3406 f"Annotation '{annotation}' - file: " 3407 + str(db_file) 3408 + " and " 3409 + str(db_hdr_file) 3410 ) 3411 3412 # Load header as VCF object 3413 db_hdr_vcf = Variants(input=db_hdr_file) 3414 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3415 log.debug( 3416 "Annotation database header: " 3417 + 
str(db_hdr_vcf_header_infos) 3418 ) 3419 3420 # For all fields in database 3421 annotation_fields_full = False 3422 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3423 annotation_fields = { 3424 key: key for key in db_hdr_vcf_header_infos 3425 } 3426 log.debug( 3427 "Annotation database header - All annotations added: " 3428 + str(annotation_fields) 3429 ) 3430 annotation_fields_full = True 3431 3432 # Init 3433 cyvcf2_header_rename_dict = {} 3434 cyvcf2_header_list = [] 3435 cyvcf2_header_indexes = {} 3436 3437 # process annotation fields 3438 for annotation_field in annotation_fields: 3439 3440 # New annotation name 3441 annotation_field_new = annotation_fields[annotation_field] 3442 3443 # Check annotation field and index in header 3444 if annotation_field in db_hdr_vcf.get_header_columns_as_list(): 3445 annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3 3446 cyvcf2_header_indexes[annotation_field_new] = annotation_field_index 3447 else: 3448 msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'" 3449 log.error(msg_err) 3450 raise ValueError(msg_err) 3451 3452 # Append annotation field in cyvcf2 header list 3453 cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id 3454 cyvcf2_header_list.append( 3455 { 3456 "ID": annotation_field_new, 3457 "Number": db_hdr_vcf_header_infos[annotation_field].num, 3458 "Type": db_hdr_vcf_header_infos[annotation_field].type, 3459 "Description": db_hdr_vcf_header_infos[annotation_field].desc, 3460 } 3461 ) 3462 3463 # Load bigwig database 3464 bw_db = pyBigWig.open(db_file) 3465 if bw_db.isBigWig(): 3466 log.debug(f"Database '{db_file}' is in 'BigWig' format") 3467 else: 3468 msg_err = f"Database '{db_file}' is NOT in 'BigWig' format" 3469 log.error(msg_err) 3470 raise ValueError(msg_err) 3471 3472 annotation_bigwig_config_list.append( 3473 { 3474 "db_file": db_file, 3475 "bw_db": bw_db, 3476 
"cyvcf2_header_rename_dict": cyvcf2_header_rename_dict, 3477 "cyvcf2_header_list": cyvcf2_header_list, 3478 "cyvcf2_header_indexes": cyvcf2_header_indexes 3479 } 3480 ) 3481 3482 # Annotate 3483 if annotation_bigwig_config_list: 3484 3485 # Annotation config 3486 log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}") 3487 3488 # Export VCF file 3489 self.export_variant_vcf( 3490 vcf_file=tmp_vcf_name, 3491 remove_info=True, 3492 add_samples=False, 3493 index=True, 3494 ) 3495 3496 # Load input tmp file 3497 input_vcf = cyvcf2.VCF(tmp_vcf_name) 3498 3499 # Add header in input file 3500 for annotation_bigwig_config in annotation_bigwig_config_list: 3501 for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]): 3502 log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'") 3503 input_vcf.add_info_to_header( 3504 cyvcf2_header_field 3505 ) 3506 3507 # Create output VCF file 3508 output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz") 3509 output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf) 3510 3511 # Fetch variants 3512 log.info(f"Annotations 'bigwig' start...") 3513 for variant in input_vcf: 3514 3515 for annotation_bigwig_config in annotation_bigwig_config_list: 3516 3517 # DB and indexes 3518 bw_db = annotation_bigwig_config.get("bw_db", None) 3519 cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None) 3520 3521 # Retrieve value from chrom pos 3522 res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS) 3523 3524 # For each annotation fields (and indexes) 3525 for cyvcf2_header_index in cyvcf2_header_indexes: 3526 3527 # If value is NOT nNone 3528 if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]): 3529 variant.INFO[cyvcf2_header_index] = 
res[cyvcf2_header_indexes[cyvcf2_header_index]] 3530 3531 # Add record in output file 3532 output_vcf.write_record(variant) 3533 3534 # Log 3535 log.debug(f"Annotation done.") 3536 3537 # Close and write file 3538 log.info(f"Annotations 'bigwig' write...") 3539 output_vcf.close() 3540 log.debug(f"Write done.") 3541 3542 # Update variants 3543 log.info(f"Annotations 'bigwig' update...") 3544 self.update_from_vcf(output_vcf_file) 3545 log.debug(f"Update done.") 3546 3547 return True 3548 3549 3550 def annotation_snpsift(self, threads: int = None) -> None: 3551 """ 3552 This function annotate with bcftools 3553 3554 :param threads: Number of threads to use 3555 :return: the value of the variable "return_value". 3556 """ 3557 3558 # DEBUG 3559 log.debug("Start annotation with bcftools databases") 3560 3561 # Threads 3562 if not threads: 3563 threads = self.get_threads() 3564 log.debug("Threads: " + str(threads)) 3565 3566 # Config 3567 config = self.get_config() 3568 log.debug("Config: " + str(config)) 3569 3570 # Config - snpSift 3571 snpsift_bin_command = get_bin_command( 3572 bin="SnpSift.jar", 3573 tool="snpsift", 3574 bin_type="jar", 3575 config=config, 3576 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3577 ) 3578 if not snpsift_bin_command: 3579 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3580 log.error(msg_err) 3581 raise ValueError(msg_err) 3582 3583 # Config - bcftools 3584 bcftools_bin_command = get_bin_command( 3585 bin="bcftools", 3586 tool="bcftools", 3587 bin_type="bin", 3588 config=config, 3589 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3590 ) 3591 if not bcftools_bin_command: 3592 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3593 log.error(msg_err) 3594 raise ValueError(msg_err) 3595 3596 # Config - BCFTools databases folders 3597 databases_folders = set( 3598 self.get_config() 3599 .get("folders", {}) 3600 .get("databases", {}) 3601 .get("annotations", ["."]) 3602 + self.get_config() 
3603 .get("folders", {}) 3604 .get("databases", {}) 3605 .get("bcftools", ["."]) 3606 ) 3607 log.debug("Databases annotations: " + str(databases_folders)) 3608 3609 # Param 3610 annotations = ( 3611 self.get_param() 3612 .get("annotation", {}) 3613 .get("snpsift", {}) 3614 .get("annotations", None) 3615 ) 3616 log.debug("Annotations: " + str(annotations)) 3617 3618 # Assembly 3619 assembly = self.get_param().get( 3620 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3621 ) 3622 3623 # Data 3624 table_variants = self.get_table_variants() 3625 3626 # Check if not empty 3627 log.debug("Check if not empty") 3628 sql_query_chromosomes = ( 3629 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3630 ) 3631 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3632 if not sql_query_chromosomes_df["count"][0]: 3633 log.info(f"VCF empty") 3634 return 3635 3636 # VCF header 3637 vcf_reader = self.get_header() 3638 log.debug("Initial header: " + str(vcf_reader.infos)) 3639 3640 # Existing annotations 3641 for vcf_annotation in self.get_header().infos: 3642 3643 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3644 log.debug( 3645 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3646 ) 3647 3648 if annotations: 3649 3650 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3651 3652 # Export VCF file 3653 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3654 3655 # Init 3656 commands = {} 3657 3658 for annotation in annotations: 3659 annotation_fields = annotations[annotation] 3660 3661 # Annotation Name 3662 annotation_name = os.path.basename(annotation) 3663 3664 if not annotation_fields: 3665 annotation_fields = {"INFO": None} 3666 3667 log.debug(f"Annotation '{annotation_name}'") 3668 log.debug( 3669 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3670 ) 3671 3672 # Create Database 3673 database = Database( 3674 database=annotation, 3675 
databases_folders=databases_folders, 3676 assembly=assembly, 3677 ) 3678 3679 # Find files 3680 db_file = database.get_database() 3681 db_file = full_path(db_file) 3682 db_hdr_file = database.get_header_file() 3683 db_hdr_file = full_path(db_hdr_file) 3684 db_file_type = database.get_format() 3685 db_tbi_file = f"{db_file}.tbi" 3686 db_file_compressed = database.is_compressed() 3687 3688 # Check if compressed 3689 if not db_file_compressed: 3690 log.error( 3691 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3692 ) 3693 raise ValueError( 3694 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3695 ) 3696 3697 # Check if indexed 3698 if not os.path.exists(db_tbi_file): 3699 log.error( 3700 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3701 ) 3702 raise ValueError( 3703 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3704 ) 3705 3706 # Check index - try to create if not exists 3707 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3708 log.error("Annotation failed: database not valid") 3709 log.error(f"Annotation annotation file: {db_file}") 3710 log.error(f"Annotation annotation header: {db_hdr_file}") 3711 log.error(f"Annotation annotation index: {db_tbi_file}") 3712 raise ValueError( 3713 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3714 ) 3715 else: 3716 3717 log.debug( 3718 f"Annotation '{annotation}' - file: " 3719 + str(db_file) 3720 + " and " 3721 + str(db_hdr_file) 3722 ) 3723 3724 # Load header as VCF object 3725 db_hdr_vcf = Variants(input=db_hdr_file) 3726 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3727 log.debug( 3728 "Annotation database header: " 3729 + str(db_hdr_vcf_header_infos) 3730 ) 3731 3732 # For all fields in database 3733 annotation_fields_full = False 3734 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3735 
annotation_fields = { 3736 key: key for key in db_hdr_vcf_header_infos 3737 } 3738 log.debug( 3739 "Annotation database header - All annotations added: " 3740 + str(annotation_fields) 3741 ) 3742 annotation_fields_full = True 3743 3744 # # Create file for field rename 3745 # log.debug("Create file for field rename") 3746 # tmp_rename = NamedTemporaryFile( 3747 # prefix=self.get_prefix(), 3748 # dir=self.get_tmp_dir(), 3749 # suffix=".rename", 3750 # delete=False, 3751 # ) 3752 # tmp_rename_name = tmp_rename.name 3753 # tmp_files.append(tmp_rename_name) 3754 3755 # Number of fields 3756 nb_annotation_field = 0 3757 annotation_list = [] 3758 annotation_infos_rename_list = [] 3759 3760 for annotation_field in annotation_fields: 3761 3762 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3763 annotation_fields_new_name = annotation_fields.get( 3764 annotation_field, annotation_field 3765 ) 3766 if not annotation_fields_new_name: 3767 annotation_fields_new_name = annotation_field 3768 3769 # Check if field is in DB and if field is not elready in input data 3770 if ( 3771 annotation_field in db_hdr_vcf.get_header().infos 3772 and annotation_fields_new_name 3773 not in self.get_header().infos 3774 ): 3775 3776 log.info( 3777 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3778 ) 3779 3780 # BCFTools annotate param to rename fields 3781 if annotation_field != annotation_fields_new_name: 3782 annotation_infos_rename_list.append( 3783 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3784 ) 3785 3786 # Add INFO field to header 3787 db_hdr_vcf_header_infos_number = ( 3788 db_hdr_vcf_header_infos[annotation_field].num or "." 
3789 ) 3790 db_hdr_vcf_header_infos_type = ( 3791 db_hdr_vcf_header_infos[annotation_field].type 3792 or "String" 3793 ) 3794 db_hdr_vcf_header_infos_description = ( 3795 db_hdr_vcf_header_infos[annotation_field].desc 3796 or f"{annotation_field} description" 3797 ) 3798 db_hdr_vcf_header_infos_source = ( 3799 db_hdr_vcf_header_infos[annotation_field].source 3800 or "unknown" 3801 ) 3802 db_hdr_vcf_header_infos_version = ( 3803 db_hdr_vcf_header_infos[annotation_field].version 3804 or "unknown" 3805 ) 3806 3807 vcf_reader.infos[annotation_fields_new_name] = ( 3808 vcf.parser._Info( 3809 annotation_fields_new_name, 3810 db_hdr_vcf_header_infos_number, 3811 db_hdr_vcf_header_infos_type, 3812 db_hdr_vcf_header_infos_description, 3813 db_hdr_vcf_header_infos_source, 3814 db_hdr_vcf_header_infos_version, 3815 self.code_type_map[ 3816 db_hdr_vcf_header_infos_type 3817 ], 3818 ) 3819 ) 3820 3821 annotation_list.append(annotation_field) 3822 3823 nb_annotation_field += 1 3824 3825 else: 3826 3827 if ( 3828 annotation_field 3829 not in db_hdr_vcf.get_header().infos 3830 ): 3831 log.warning( 3832 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3833 ) 3834 if ( 3835 annotation_fields_new_name 3836 in self.get_header().infos 3837 ): 3838 log.warning( 3839 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3840 ) 3841 3842 log.info( 3843 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3844 ) 3845 3846 annotation_infos = ",".join(annotation_list) 3847 3848 if annotation_infos != "": 3849 3850 # Annotated VCF (and error file) 3851 tmp_annotation_vcf_name = os.path.join( 3852 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3853 ) 3854 tmp_annotation_vcf_name_err = ( 3855 tmp_annotation_vcf_name + ".err" 3856 ) 3857 3858 # Add fields to annotate 3859 if not annotation_fields_full: 3860 annotation_infos_option = f"-info {annotation_infos}" 3861 else: 
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        This function annotate with bcftools.

        Variants are exported to a temporary VCF; each configured database is
        applied per chromosome with `bcftools annotate` restricted to a BED of
        regions around the variants, the per-database annotated VCFs are merged
        back with `bcftools merge`, and the merged result updates the variants
        table.

        :param threads: Number of threads to use
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG - keep tmp files when verbosity is debug
        # NOTE(review): delete_tmp is computed but not read in this method — confirm intent
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders (generic annotations + bcftools-specific)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - bcftools annotations dict: {database: {field: new_name}}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF - temporary input file shared by all annotate commands
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Collected annotated files, shell commands, tmp files and err files
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bgzip required for tabix access)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check database and header files exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # 'ALL'/'INFO' means: annotate with every field of the database
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field new name (falls back to the original name)
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check field is in DB and not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header (fall back to sane defaults)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools "-c" column: rename syntax when new name differs
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command - keep only "##" meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions - windows of +/- 1Mb around each variant,
                            # merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files - one annotated VCF (+ err file) per chromosome
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command - annotate restricted to BED regions, then tabix-index
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # If some commands
            if commands:

                # Export VCF file
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Calculate threads per annotate command (commands run in parallel)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # At least one thread per command
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all annotated VCFs back with the input VCF
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command - cleanup chained after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages - scan the bcftools/tabix stderr files
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                # htslib-style prefixes: [W::...] warning, [E::...] error
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # Log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # Debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # Failed - any [E::...] message aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
4404 Used if no full "analysis" is provided. 4405 Default: "exome" 4406 - "phenopacket" (dict/file): 4407 Samples and phenotipic features parameters (see Exomiser docs). 4408 Either a dict, or a file in JSON or YAML format. 4409 Default: None 4410 - "subject" (dict): 4411 Sample parameters (see Exomiser docs). 4412 Example: 4413 "subject": 4414 { 4415 "id": "ISDBM322017", 4416 "sex": "FEMALE" 4417 } 4418 Default: None 4419 - "sample" (string): 4420 Sample name to construct "subject" section: 4421 "subject": 4422 { 4423 "id": "<sample>", 4424 "sex": "UNKNOWN_SEX" 4425 } 4426 Default: None 4427 - "phenotypicFeatures" (dict) 4428 Phenotypic features to construct "subject" section. 4429 Example: 4430 "phenotypicFeatures": 4431 [ 4432 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4433 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4434 ] 4435 - "hpo" (list) 4436 List of HPO ids as phenotypic features. 4437 Example: 4438 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4439 Default: [] 4440 - "outputOptions" (dict): 4441 Output options (see Exomiser docs). 4442 Default: 4443 "output_options" = 4444 { 4445 "outputContributingVariantsOnly": False, 4446 "numGenes": 0, 4447 "outputFormats": ["TSV_VARIANT", "VCF"] 4448 } 4449 - "transcript_source" (string): 4450 Transcript source (either "refseq", "ucsc", "ensembl") 4451 Default: "refseq" 4452 - "exomiser_to_info" (boolean): 4453 Add exomiser TSV file columns as INFO fields in VCF. 4454 Default: False 4455 - "release" (string): 4456 Exomise database release. 4457 If not exists, database release will be downloaded (take a while). 4458 Default: None (provided by application.properties configuration file) 4459 - "exomiser_application_properties" (file): 4460 Exomiser configuration file (see Exomiser docs). 4461 Useful to automatically download databases (especially for specific genome databases). 
4462 4463 Notes: 4464 - If no sample in parameters, first sample in VCF will be chosen 4465 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4466 4467 :param threads: The number of threads to use 4468 :return: None. 4469 """ 4470 4471 # DEBUG 4472 log.debug("Start annotation with Exomiser databases") 4473 4474 # Threads 4475 if not threads: 4476 threads = self.get_threads() 4477 log.debug("Threads: " + str(threads)) 4478 4479 # Config 4480 config = self.get_config() 4481 log.debug("Config: " + str(config)) 4482 4483 # Config - Folders - Databases 4484 databases_folders = ( 4485 config.get("folders", {}) 4486 .get("databases", {}) 4487 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4488 ) 4489 databases_folders = full_path(databases_folders) 4490 if not os.path.exists(databases_folders): 4491 log.error(f"Databases annotations: {databases_folders} NOT found") 4492 log.debug("Databases annotations: " + str(databases_folders)) 4493 4494 # Config - Exomiser 4495 exomiser_bin_command = get_bin_command( 4496 bin="exomiser-cli*.jar", 4497 tool="exomiser", 4498 bin_type="jar", 4499 config=config, 4500 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4501 ) 4502 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4503 if not exomiser_bin_command: 4504 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4505 log.error(msg_err) 4506 raise ValueError(msg_err) 4507 4508 # Param 4509 param = self.get_param() 4510 log.debug("Param: " + str(param)) 4511 4512 # Param - Exomiser 4513 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4514 log.debug(f"Param Exomiser: {param_exomiser}") 4515 4516 # Param - Assembly 4517 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4518 log.debug("Assembly: " + str(assembly)) 4519 4520 # Data 4521 table_variants = self.get_table_variants() 4522 4523 # Check if not empty 4524 log.debug("Check if not empty") 4525 sql_query_chromosomes = 
( 4526 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4527 ) 4528 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4529 log.info(f"VCF empty") 4530 return False 4531 4532 # VCF header 4533 vcf_reader = self.get_header() 4534 log.debug("Initial header: " + str(vcf_reader.infos)) 4535 4536 # Samples 4537 samples = self.get_header_sample_list() 4538 if not samples: 4539 log.error("No Samples in VCF") 4540 return False 4541 log.debug(f"Samples: {samples}") 4542 4543 # Memory limit 4544 memory_limit = self.get_memory("8G") 4545 log.debug(f"memory_limit: {memory_limit}") 4546 4547 # Exomiser java options 4548 exomiser_java_options = ( 4549 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4550 ) 4551 log.debug(f"Exomiser java options: {exomiser_java_options}") 4552 4553 # Download Exomiser (if not exists) 4554 exomiser_release = param_exomiser.get("release", None) 4555 exomiser_application_properties = param_exomiser.get( 4556 "exomiser_application_properties", None 4557 ) 4558 databases_download_exomiser( 4559 assemblies=[assembly], 4560 exomiser_folder=databases_folders, 4561 exomiser_release=exomiser_release, 4562 exomiser_phenotype_release=exomiser_release, 4563 exomiser_application_properties=exomiser_application_properties, 4564 ) 4565 4566 # Force annotation 4567 force_update_annotation = True 4568 4569 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4570 log.debug("Start annotation Exomiser") 4571 4572 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4573 4574 # tmp_dir = "/tmp/exomiser" 4575 4576 ### ANALYSIS ### 4577 ################ 4578 4579 # Create analysis.json through analysis dict 4580 # either analysis in param or by default 4581 # depending on preset exome/genome) 4582 4583 # Init analysis dict 4584 param_exomiser_analysis_dict = {} 4585 4586 # analysis from param 4587 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4588 
param_exomiser_analysis = full_path(param_exomiser_analysis) 4589 4590 # If analysis in param -> load anlaysis json 4591 if param_exomiser_analysis: 4592 4593 # If param analysis is a file and exists 4594 if isinstance(param_exomiser_analysis, str) and os.path.exists( 4595 param_exomiser_analysis 4596 ): 4597 # Load analysis file into analysis dict (either yaml or json) 4598 with open(param_exomiser_analysis) as json_file: 4599 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4600 4601 # If param analysis is a dict 4602 elif isinstance(param_exomiser_analysis, dict): 4603 # Load analysis dict into analysis dict (either yaml or json) 4604 param_exomiser_analysis_dict = param_exomiser_analysis 4605 4606 # Error analysis type 4607 else: 4608 log.error(f"Analysis type unknown. Check param file.") 4609 raise ValueError(f"Analysis type unknown. Check param file.") 4610 4611 # Case no input analysis config file/dict 4612 # Use preset (exome/genome) to open default config file 4613 if not param_exomiser_analysis_dict: 4614 4615 # default preset 4616 default_preset = "exome" 4617 4618 # Get param preset or default preset 4619 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4620 4621 # Try to find if preset is a file 4622 if os.path.exists(param_exomiser_preset): 4623 # Preset file is provided in full path 4624 param_exomiser_analysis_default_config_file = ( 4625 param_exomiser_preset 4626 ) 4627 # elif os.path.exists(full_path(param_exomiser_preset)): 4628 # # Preset file is provided in full path 4629 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4630 elif os.path.exists( 4631 os.path.join(folder_config, param_exomiser_preset) 4632 ): 4633 # Preset file is provided a basename in config folder (can be a path with subfolders) 4634 param_exomiser_analysis_default_config_file = os.path.join( 4635 folder_config, param_exomiser_preset 4636 ) 4637 else: 4638 # Construct preset file 4639 
param_exomiser_analysis_default_config_file = os.path.join( 4640 folder_config, 4641 f"preset-{param_exomiser_preset}-analysis.json", 4642 ) 4643 4644 # If preset file exists 4645 param_exomiser_analysis_default_config_file = full_path( 4646 param_exomiser_analysis_default_config_file 4647 ) 4648 if os.path.exists(param_exomiser_analysis_default_config_file): 4649 # Load prest file into analysis dict (either yaml or json) 4650 with open( 4651 param_exomiser_analysis_default_config_file 4652 ) as json_file: 4653 # param_exomiser_analysis_dict[""] = json.load(json_file) 4654 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4655 json_file 4656 ) 4657 4658 # Error preset file 4659 else: 4660 log.error( 4661 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4662 ) 4663 raise ValueError( 4664 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4665 ) 4666 4667 # If no analysis dict created 4668 if not param_exomiser_analysis_dict: 4669 log.error(f"No analysis config") 4670 raise ValueError(f"No analysis config") 4671 4672 # Log 4673 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4674 4675 ### PHENOPACKET ### 4676 ################### 4677 4678 # If no PhenoPacket in analysis dict -> check in param 4679 if "phenopacket" not in param_exomiser_analysis_dict: 4680 4681 # If PhenoPacket in param -> load anlaysis json 4682 if param_exomiser.get("phenopacket", None): 4683 4684 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4685 param_exomiser_phenopacket = full_path( 4686 param_exomiser_phenopacket 4687 ) 4688 4689 # If param phenopacket is a file and exists 4690 if isinstance( 4691 param_exomiser_phenopacket, str 4692 ) and os.path.exists(param_exomiser_phenopacket): 4693 # Load phenopacket file into analysis dict (either yaml or json) 4694 with open(param_exomiser_phenopacket) as json_file: 4695 param_exomiser_analysis_dict["phenopacket"] = ( 4696 yaml.safe_load(json_file) 
4697 ) 4698 4699 # If param phenopacket is a dict 4700 elif isinstance(param_exomiser_phenopacket, dict): 4701 # Load phenopacket dict into analysis dict (either yaml or json) 4702 param_exomiser_analysis_dict["phenopacket"] = ( 4703 param_exomiser_phenopacket 4704 ) 4705 4706 # Error phenopacket type 4707 else: 4708 log.error(f"Phenopacket type unknown. Check param file.") 4709 raise ValueError( 4710 f"Phenopacket type unknown. Check param file." 4711 ) 4712 4713 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4714 if "phenopacket" not in param_exomiser_analysis_dict: 4715 4716 # Init PhenoPacket 4717 param_exomiser_analysis_dict["phenopacket"] = { 4718 "id": "analysis", 4719 "proband": {}, 4720 } 4721 4722 ### Add subject ### 4723 4724 # If subject exists 4725 param_exomiser_subject = param_exomiser.get("subject", {}) 4726 4727 # If subject not exists -> found sample ID 4728 if not param_exomiser_subject: 4729 4730 # Found sample ID in param 4731 sample = param_exomiser.get("sample", None) 4732 4733 # Find sample ID (first sample) 4734 if not sample: 4735 sample_list = self.get_header_sample_list() 4736 if len(sample_list) > 0: 4737 sample = sample_list[0] 4738 else: 4739 log.error(f"No sample found") 4740 raise ValueError(f"No sample found") 4741 4742 # Create subject 4743 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4744 4745 # Add to dict 4746 param_exomiser_analysis_dict["phenopacket"][ 4747 "subject" 4748 ] = param_exomiser_subject 4749 4750 ### Add "phenotypicFeatures" ### 4751 4752 # If phenotypicFeatures exists 4753 param_exomiser_phenotypicfeatures = param_exomiser.get( 4754 "phenotypicFeatures", [] 4755 ) 4756 4757 # If phenotypicFeatures not exists -> Try to infer from hpo list 4758 if not param_exomiser_phenotypicfeatures: 4759 4760 # Found HPO in param 4761 param_exomiser_hpo = param_exomiser.get("hpo", []) 4762 4763 # Split HPO if list in string format separated by comma 4764 if 
isinstance(param_exomiser_hpo, str): 4765 param_exomiser_hpo = param_exomiser_hpo.split(",") 4766 4767 # Create HPO list 4768 for hpo in param_exomiser_hpo: 4769 hpo_clean = re.sub("[^0-9]", "", hpo) 4770 param_exomiser_phenotypicfeatures.append( 4771 { 4772 "type": { 4773 "id": f"HP:{hpo_clean}", 4774 "label": f"HP:{hpo_clean}", 4775 } 4776 } 4777 ) 4778 4779 # Add to dict 4780 param_exomiser_analysis_dict["phenopacket"][ 4781 "phenotypicFeatures" 4782 ] = param_exomiser_phenotypicfeatures 4783 4784 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4785 if not param_exomiser_phenotypicfeatures: 4786 for step in param_exomiser_analysis_dict.get( 4787 "analysis", {} 4788 ).get("steps", []): 4789 if "hiPhivePrioritiser" in step: 4790 param_exomiser_analysis_dict.get("analysis", {}).get( 4791 "steps", [] 4792 ).remove(step) 4793 4794 ### Add Input File ### 4795 4796 # Initial file name and htsFiles 4797 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4798 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4799 { 4800 "uri": tmp_vcf_name, 4801 "htsFormat": "VCF", 4802 "genomeAssembly": assembly, 4803 } 4804 ] 4805 4806 ### Add metaData ### 4807 4808 # If metaData not in analysis dict 4809 if "metaData" not in param_exomiser_analysis_dict: 4810 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4811 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4812 "createdBy": "howard", 4813 "phenopacketSchemaVersion": 1, 4814 } 4815 4816 ### OutputOptions ### 4817 4818 # Init output result folder 4819 output_results = os.path.join(tmp_dir, "results") 4820 4821 # If no outputOptions in analysis dict 4822 if "outputOptions" not in param_exomiser_analysis_dict: 4823 4824 # default output formats 4825 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4826 4827 # Get outputOptions in param 4828 output_options = param_exomiser.get("outputOptions", None) 4829 4830 # If no output_options in param -> check 4831 if not output_options: 
4832 output_options = { 4833 "outputContributingVariantsOnly": False, 4834 "numGenes": 0, 4835 "outputFormats": defaut_output_formats, 4836 } 4837 4838 # Replace outputDirectory in output options 4839 output_options["outputDirectory"] = output_results 4840 output_options["outputFileName"] = "howard" 4841 4842 # Add outputOptions in analysis dict 4843 param_exomiser_analysis_dict["outputOptions"] = output_options 4844 4845 else: 4846 4847 # Replace output_results and output format (if exists in param) 4848 param_exomiser_analysis_dict["outputOptions"][ 4849 "outputDirectory" 4850 ] = output_results 4851 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4852 list( 4853 set( 4854 param_exomiser_analysis_dict.get( 4855 "outputOptions", {} 4856 ).get("outputFormats", []) 4857 + ["TSV_VARIANT", "VCF"] 4858 ) 4859 ) 4860 ) 4861 4862 # log 4863 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4864 4865 ### ANALYSIS FILE ### 4866 ##################### 4867 4868 ### Full JSON analysis config file ### 4869 4870 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4871 with open(exomiser_analysis, "w") as fp: 4872 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4873 4874 ### SPLIT analysis and sample config files 4875 4876 # Splitted analysis dict 4877 param_exomiser_analysis_dict_for_split = ( 4878 param_exomiser_analysis_dict.copy() 4879 ) 4880 4881 # Phenopacket JSON file 4882 exomiser_analysis_phenopacket = os.path.join( 4883 tmp_dir, "analysis_phenopacket.json" 4884 ) 4885 with open(exomiser_analysis_phenopacket, "w") as fp: 4886 json.dump( 4887 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4888 fp, 4889 indent=4, 4890 ) 4891 4892 # Analysis JSON file without Phenopacket parameters 4893 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4894 exomiser_analysis_analysis = os.path.join( 4895 tmp_dir, "analysis_analysis.json" 4896 ) 4897 with open(exomiser_analysis_analysis, "w") as fp: 4898 
json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4899 4900 ### INITAL VCF file ### 4901 ####################### 4902 4903 ### Create list of samples to use and include inti initial VCF file #### 4904 4905 # Subject (main sample) 4906 # Get sample ID in analysis dict 4907 sample_subject = ( 4908 param_exomiser_analysis_dict.get("phenopacket", {}) 4909 .get("subject", {}) 4910 .get("id", None) 4911 ) 4912 sample_proband = ( 4913 param_exomiser_analysis_dict.get("phenopacket", {}) 4914 .get("proband", {}) 4915 .get("subject", {}) 4916 .get("id", None) 4917 ) 4918 sample = [] 4919 if sample_subject: 4920 sample.append(sample_subject) 4921 if sample_proband: 4922 sample.append(sample_proband) 4923 4924 # Get sample ID within Pedigree 4925 pedigree_persons_list = ( 4926 param_exomiser_analysis_dict.get("phenopacket", {}) 4927 .get("pedigree", {}) 4928 .get("persons", {}) 4929 ) 4930 4931 # Create list with all sample ID in pedigree (if exists) 4932 pedigree_persons = [] 4933 for person in pedigree_persons_list: 4934 pedigree_persons.append(person.get("individualId")) 4935 4936 # Concat subject sample ID and samples ID in pedigreesamples 4937 samples = list(set(sample + pedigree_persons)) 4938 4939 # Check if sample list is not empty 4940 if not samples: 4941 log.error(f"No samples found") 4942 raise ValueError(f"No samples found") 4943 4944 # Create VCF with sample (either sample in param or first one by default) 4945 # Export VCF file 4946 self.export_variant_vcf( 4947 vcf_file=tmp_vcf_name, 4948 remove_info=True, 4949 add_samples=True, 4950 list_samples=samples, 4951 index=False, 4952 ) 4953 4954 ### Execute Exomiser ### 4955 ######################## 4956 4957 # Init command 4958 exomiser_command = "" 4959 4960 # Command exomiser options 4961 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4962 4963 # Release 4964 exomiser_release = 
param_exomiser.get("release", None) 4965 if exomiser_release: 4966 # phenotype data version 4967 exomiser_options += ( 4968 f" --exomiser.phenotype.data-version={exomiser_release} " 4969 ) 4970 # data version 4971 exomiser_options += ( 4972 f" --exomiser.{assembly}.data-version={exomiser_release} " 4973 ) 4974 # variant white list 4975 variant_white_list_file = ( 4976 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4977 ) 4978 if os.path.exists( 4979 os.path.join( 4980 databases_folders, assembly, variant_white_list_file 4981 ) 4982 ): 4983 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4984 4985 # transcript_source 4986 transcript_source = param_exomiser.get( 4987 "transcript_source", None 4988 ) # ucsc, refseq, ensembl 4989 if transcript_source: 4990 exomiser_options += ( 4991 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4992 ) 4993 4994 # If analysis contain proband param 4995 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4996 "proband", {} 4997 ): 4998 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4999 5000 # If no proband (usually uniq sample) 5001 else: 5002 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5003 5004 # Log 5005 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5006 5007 # Run command 5008 result = subprocess.call( 5009 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5010 ) 5011 if result: 5012 log.error("Exomiser command failed") 5013 raise ValueError("Exomiser command failed") 5014 5015 ### RESULTS ### 5016 ############### 5017 5018 ### Annotate with TSV fields ### 5019 5020 # Init result tsv file 5021 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5022 5023 # Init result tsv file 5024 output_results_tsv = os.path.join(output_results, 
"howard.variants.tsv") 5025 5026 # Parse TSV file and explode columns in INFO field 5027 if exomiser_to_info and os.path.exists(output_results_tsv): 5028 5029 # Log 5030 log.debug("Exomiser columns to VCF INFO field") 5031 5032 # Retrieve columns and types 5033 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5034 output_results_tsv_df = self.get_query_to_df(query) 5035 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5036 5037 # Init concat fields for update 5038 sql_query_update_concat_fields = [] 5039 5040 # Fields to avoid 5041 fields_to_avoid = [ 5042 "CONTIG", 5043 "START", 5044 "END", 5045 "REF", 5046 "ALT", 5047 "QUAL", 5048 "FILTER", 5049 "GENOTYPE", 5050 ] 5051 5052 # List all columns to add into header 5053 for header_column in output_results_tsv_columns: 5054 5055 # If header column is enable 5056 if header_column not in fields_to_avoid: 5057 5058 # Header info type 5059 header_info_type = "String" 5060 header_column_df = output_results_tsv_df[header_column] 5061 header_column_df_dtype = header_column_df.dtype 5062 if header_column_df_dtype == object: 5063 if ( 5064 pd.to_numeric(header_column_df, errors="coerce") 5065 .notnull() 5066 .all() 5067 ): 5068 header_info_type = "Float" 5069 else: 5070 header_info_type = "Integer" 5071 5072 # Header info 5073 characters_to_validate = ["-"] 5074 pattern = "[" + "".join(characters_to_validate) + "]" 5075 header_info_name = re.sub( 5076 pattern, 5077 "_", 5078 f"Exomiser_{header_column}".replace("#", ""), 5079 ) 5080 header_info_number = "." 
5081 header_info_description = ( 5082 f"Exomiser {header_column} annotation" 5083 ) 5084 header_info_source = "Exomiser" 5085 header_info_version = "unknown" 5086 header_info_code = CODE_TYPE_MAP[header_info_type] 5087 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5088 header_info_name, 5089 header_info_number, 5090 header_info_type, 5091 header_info_description, 5092 header_info_source, 5093 header_info_version, 5094 header_info_code, 5095 ) 5096 5097 # Add field to add for update to concat fields 5098 sql_query_update_concat_fields.append( 5099 f""" 5100 CASE 5101 WHEN table_parquet."{header_column}" NOT IN ('','.') 5102 THEN concat( 5103 '{header_info_name}=', 5104 table_parquet."{header_column}", 5105 ';' 5106 ) 5107 5108 ELSE '' 5109 END 5110 """ 5111 ) 5112 5113 # Update query 5114 sql_query_update = f""" 5115 UPDATE {table_variants} as table_variants 5116 SET INFO = concat( 5117 CASE 5118 WHEN INFO NOT IN ('', '.') 5119 THEN INFO 5120 ELSE '' 5121 END, 5122 CASE 5123 WHEN table_variants.INFO NOT IN ('','.') 5124 THEN ';' 5125 ELSE '' 5126 END, 5127 ( 5128 SELECT 5129 concat( 5130 {",".join(sql_query_update_concat_fields)} 5131 ) 5132 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5133 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5134 AND table_parquet.\"START\" = table_variants.\"POS\" 5135 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5136 AND table_parquet.\"REF\" = table_variants.\"REF\" 5137 ) 5138 ) 5139 ; 5140 """ 5141 5142 # Update 5143 self.conn.execute(sql_query_update) 5144 5145 ### Annotate with VCF INFO field ### 5146 5147 # Init result VCF file 5148 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5149 5150 # If VCF exists 5151 if os.path.exists(output_results_vcf): 5152 5153 # Log 5154 log.debug("Exomiser result VCF update variants") 5155 5156 # Find Exomiser INFO field annotation in header 5157 with 
gzip.open(output_results_vcf, "rt") as f: 5158 header_list = self.read_vcf_header(f) 5159 exomiser_vcf_header = vcf.Reader( 5160 io.StringIO("\n".join(header_list)) 5161 ) 5162 5163 # Add annotation INFO field to header 5164 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5165 5166 # Update variants with VCF 5167 self.update_from_vcf(output_results_vcf) 5168 5169 return True 5170 5171 def annotation_snpeff(self, threads: int = None) -> None: 5172 """ 5173 This function annotate with snpEff 5174 5175 :param threads: The number of threads to use 5176 :return: the value of the variable "return_value". 5177 """ 5178 5179 # DEBUG 5180 log.debug("Start annotation with snpeff databases") 5181 5182 # Threads 5183 if not threads: 5184 threads = self.get_threads() 5185 log.debug("Threads: " + str(threads)) 5186 5187 # DEBUG 5188 delete_tmp = True 5189 if self.get_config().get("verbosity", "warning") in ["debug"]: 5190 delete_tmp = False 5191 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5192 5193 # Config 5194 config = self.get_config() 5195 log.debug("Config: " + str(config)) 5196 5197 # Config - Folders - Databases 5198 databases_folders = ( 5199 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5200 ) 5201 log.debug("Databases annotations: " + str(databases_folders)) 5202 5203 # Config - snpEff bin command 5204 snpeff_bin_command = get_bin_command( 5205 bin="snpEff.jar", 5206 tool="snpeff", 5207 bin_type="jar", 5208 config=config, 5209 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5210 ) 5211 if not snpeff_bin_command: 5212 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5213 log.error(msg_err) 5214 raise ValueError(msg_err) 5215 5216 # Config - snpEff databases 5217 snpeff_databases = ( 5218 config.get("folders", {}) 5219 .get("databases", {}) 5220 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5221 ) 5222 snpeff_databases = full_path(snpeff_databases) 5223 if snpeff_databases is not None and 
snpeff_databases != "": 5224 log.debug(f"Create snpEff databases folder") 5225 if not os.path.exists(snpeff_databases): 5226 os.makedirs(snpeff_databases) 5227 5228 # Param 5229 param = self.get_param() 5230 log.debug("Param: " + str(param)) 5231 5232 # Param 5233 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5234 log.debug("Options: " + str(options)) 5235 5236 # Param - Assembly 5237 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5238 5239 # Param - Options 5240 snpeff_options = ( 5241 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5242 ) 5243 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5244 snpeff_csvstats = ( 5245 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5246 ) 5247 if snpeff_stats: 5248 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5249 snpeff_stats = full_path(snpeff_stats) 5250 snpeff_options += f" -stats {snpeff_stats}" 5251 if snpeff_csvstats: 5252 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5253 snpeff_csvstats = full_path(snpeff_csvstats) 5254 snpeff_options += f" -csvStats {snpeff_csvstats}" 5255 5256 # Data 5257 table_variants = self.get_table_variants() 5258 5259 # Check if not empty 5260 log.debug("Check if not empty") 5261 sql_query_chromosomes = ( 5262 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5263 ) 5264 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5265 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5266 log.info(f"VCF empty") 5267 return 5268 5269 # Export in VCF 5270 log.debug("Create initial file to annotate") 5271 tmp_vcf = NamedTemporaryFile( 5272 prefix=self.get_prefix(), 5273 dir=self.get_tmp_dir(), 5274 suffix=".vcf.gz", 5275 delete=True, 5276 ) 5277 tmp_vcf_name = tmp_vcf.name 5278 5279 # VCF header 5280 vcf_reader = self.get_header() 5281 log.debug("Initial header: " + 
str(vcf_reader.infos)) 5282 5283 # Existing annotations 5284 for vcf_annotation in self.get_header().infos: 5285 5286 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5287 log.debug( 5288 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5289 ) 5290 5291 # Memory limit 5292 # if config.get("memory", None): 5293 # memory_limit = config.get("memory", "8G") 5294 # else: 5295 # memory_limit = "8G" 5296 memory_limit = self.get_memory("8G") 5297 log.debug(f"memory_limit: {memory_limit}") 5298 5299 # snpEff java options 5300 snpeff_java_options = ( 5301 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5302 ) 5303 log.debug(f"Exomiser java options: {snpeff_java_options}") 5304 5305 force_update_annotation = True 5306 5307 if "ANN" not in self.get_header().infos or force_update_annotation: 5308 5309 # Check snpEff database 5310 log.debug(f"Check snpEff databases {[assembly]}") 5311 databases_download_snpeff( 5312 folder=snpeff_databases, assemblies=[assembly], config=config 5313 ) 5314 5315 # Export VCF file 5316 self.export_variant_vcf( 5317 vcf_file=tmp_vcf_name, 5318 remove_info=True, 5319 add_samples=False, 5320 index=True, 5321 ) 5322 5323 # Tmp file 5324 err_files = [] 5325 tmp_annotate_vcf = NamedTemporaryFile( 5326 prefix=self.get_prefix(), 5327 dir=self.get_tmp_dir(), 5328 suffix=".vcf", 5329 delete=False, 5330 ) 5331 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5332 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5333 err_files.append(tmp_annotate_vcf_name_err) 5334 5335 # Command 5336 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5337 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5338 run_parallel_commands([snpeff_command], 1) 5339 5340 # Error messages 5341 log.info(f"Error/Warning messages:") 5342 error_message_command_all = [] 5343 
            # Parse the stderr files produced by the snpEff run: collect every
            # message, and split warnings ("[W::") from errors ("[E::").
            error_message_command_warning = []
            error_message_command_err = []
            for err_file in err_files:
                with open(err_file, "r") as f:
                    for line in f:
                        message = line.strip()
                        error_message_command_all.append(message)
                        if line.startswith("[W::"):
                            error_message_command_warning.append(message)
                        if line.startswith("[E::"):
                            error_message_command_err.append(f"{err_file}: " + message)
            # log info - deduplicated warnings and errors, surfaced at info level
            for message in list(
                set(error_message_command_err + error_message_command_warning)
            ):
                log.info(f"   {message}")
            # debug info - all messages, deduplicated
            for message in list(set(error_message_command_all)):
                log.debug(f"   {message}")
            # failed - any "[E::" line aborts the whole annotation
            if len(error_message_command_err):
                log.error("Annotation failed: Error in commands")
                raise ValueError("Annotation failed: Error in commands")

            # Find annotation in header: read the annotated VCF header and merge
            # any INFO field not already present into the in-memory header
            with open(tmp_annotate_vcf_name, "rt") as f:
                header_list = self.read_vcf_header(f)
            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

            for ann in annovar_vcf_header.infos:
                if ann not in self.get_header().infos:
                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

            # Update variants table from the annotated VCF
            log.info(f"Annotation - Updating...")
            self.update_from_vcf(tmp_annotate_vcf_name)

        else:
            if "ANN" in self.get_header().infos:
                log.debug(f"Existing snpEff annotations in VCF")
            if force_update_annotation:
                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with Annovar (table_annovar.pl).

        Exports variants to a temporary VCF, runs one Annovar command per
        configured database (piped through bcftools/sed/awk to clean the
        Annovar output), merges the per-database annotated VCFs with
        `bcftools merge`, then updates the variants table and the in-memory
        VCF header with the new INFO fields. All temporary files are removed
        at the end.

        :param threads: number of threads to use (defaults to
            `self.get_threads()` when not provided)
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files accumulated over the whole run (cleaned at the end)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed here but the cleanup block at the
        # end of this method runs unconditionally — confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper table_annovar.pl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (a list config keeps only its first entry)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> field mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder, created when missing
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Always re-annotate, even when fields already exist in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): `commands` appears unused in this method — candidate
            # for removal.
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO stripped to ".", no samples, tabix-indexed)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database - download missing databases for the assembly
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset inside the loop, so only the
                # current database's stderr is parsed below — confirm intent.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                # Annovar names its output "<prefix>.<assembly>_multianno.vcf"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" line per field for bcftools)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol - Annovar database/protocol name
                protocol = annotation

                # argument - extra per-protocol argument for table_annovar.pl
                argument = ""

                # operation - f=filter (default), g=gene-based, r=region-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        # "^INFO/x" means keep x (bcftools -x inverts on ^)
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages - parse stderr; warnings/errors from either
                # bcftools ("[W::"/"[E::") or Annovar ("WARNING"/"ERROR")
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge - merge the original VCF with all annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged (bgzipped) VCF and
                # merge new INFO fields into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)

    # Parquet
    def annotation_parquet(self, threads: int = None) -> None:
        """
        Annotate the loaded variants from Parquet (or attached) annotation
        databases.

        For each configured database, resolves the database and header files,
        builds per-chromosome DuckDB UPDATE queries that append the selected
        INFO fields (extracted from the database's INFO column or from
        dedicated columns) to the variants table, and updates the in-memory
        VCF header accordingly. Supports forced update/append modes and
        region-type databases (positional join instead of exact variant join).

        :param threads: number of threads to use (defaults to
            `self.get_threads()` when not provided)
        :return: None (returns early if the variants table is empty)
        """

        # DEBUG
        log.debug("Start annotation with parquet databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # DEBUG
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - union of the "annotations" and "parquet" database folders
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("parquet", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("parquet", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Force Update Annotation - overwrite fields already present in INFO
        force_update_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_update", False)
        )
        )
        log.debug(f"force_update_annotation={force_update_annotation}")
        # Force Append Annotation - only fill fields that are empty ('' or '.')
        force_append_annotation = (
            self.get_param()
            .get("annotation", {})
            .get("options", {})
            .get("annotations_append", False)
        )
        log.debug(f"force_append_annotation={force_append_annotation}")

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty - nothing to annotate in an empty table
        log.debug("Check if not empty")
        sql_query_chromosomes_df = self.get_query_to_df(
            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
        )
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Nb Variants POS - total variant count, used for the final summary log
        log.debug("NB Variants Start")
        nb_variants = self.conn.execute(
            f"SELECT count(*) AS count FROM variants"
        ).fetchdf()["count"][0]
        log.debug("NB Variants Stop")

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Added columns - dropped again at the end of the method
        added_columns = []

        # drop indexes before the UPDATE queries
        log.debug(f"Drop indexes...")
        self.drop_indexes()

        if annotations:

            # "ALL" pseudo-annotation: scan available databases and add each
            # one (full INFO) to the annotation dict
            if "ALL" in annotations:

                all_param = annotations.get("ALL", {})
                all_param_formats = all_param.get("formats", None)
                all_param_releases = all_param.get("releases", None)

                databases_infos_dict = self.scan_databases(
                    database_formats=all_param_formats,
                    database_releases=all_param_releases,
                )
                for database_infos in databases_infos_dict.keys():
                    if database_infos not in annotations:
                        annotations[database_infos] = {"INFO": None}

            for annotation in annotations:

                # Skip the "ALL" marker itself (expanded above)
                if annotation in ["ALL"]:
                    continue

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # Annotation fields - empty mapping means "annotate full INFO"
                annotation_fields = annotations[annotation]
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                parquet_file = database.get_database()
                parquet_hdr_file = database.get_header_file()
                parquet_type = database.get_type()

                # Check if files exists - both database and its .hdr are required
                if not parquet_file or not parquet_hdr_file:
                    msg_err_list = []
                    if not parquet_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file not found"
                        )
                    if parquet_file and not parquet_hdr_file:
                        msg_err_list.append(
                            f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'"
                        )

                    log.error(". ".join(msg_err_list))
                    raise ValueError(". ".join(msg_err_list))
                else:
                    # Get parquet connexion - attach the database to DuckDB if needed
                    parquet_sql_attach = database.get_sql_database_attach(
                        output="query"
                    )
                    if parquet_sql_attach:
                        self.conn.execute(parquet_sql_attach)
                    parquet_file_link = database.get_sql_database_link()
                    # Log
                    log.debug(
                        f"Annotation '{annotation_name}' - file: "
                        + str(parquet_file)
                        + " and "
                        + str(parquet_hdr_file)
                    )

                    # Database full header columns
                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
                        parquet_hdr_file
                    )
                    # Log
                    log.debug(
                        "Annotation database header columns : "
                        + str(parquet_hdr_vcf_header_columns)
                    )

                    # Load header as VCF object
                    parquet_hdr_vcf_header_infos = database.get_header().infos
                    # Log
                    log.debug(
                        "Annotation database header: "
                        + str(parquet_hdr_vcf_header_infos)
                    )

                    # Get extra infos
                    parquet_columns = database.get_extra_columns()
                    # Log
                    log.debug("Annotation database Columns: " + str(parquet_columns))

                    # Add extra columns if "ALL" in annotation_fields
                    # if "ALL" in annotation_fields:
                    #     allow_add_extra_column = True
                    if "ALL" in annotation_fields and database.get_extra_columns():
                        for extra_column in database.get_extra_columns():
                            if (
                                extra_column not in annotation_fields
                                and extra_column.replace("INFO/", "")
                                not in parquet_hdr_vcf_header_infos
                            ):
                                # Synthesize a default String INFO entry for
                                # columns absent from the database header
                                parquet_hdr_vcf_header_infos[extra_column] = (
                                    vcf.parser._Info(
                                        extra_column,
                                        ".",
                                        "String",
                                        f"{extra_column} description",
                                        "unknown",
                                        "unknown",
                                        self.code_type_map["String"],
                                    )
                                )

                    # For all fields in database - expand "ALL"/"INFO" to every
                    # field declared in the database header
                    annotation_fields_all = False
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields_all = True
                        annotation_fields = {
                            key: key for key in parquet_hdr_vcf_header_infos
                        }

                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Init

                    # List of annotation fields to use
                    sql_query_annotation_update_info_sets = []

                    # List of annotation to agregate (regions databases only)
                    sql_query_annotation_to_agregate = []

                    # Number of fields
                    nb_annotation_field = 0

                    # Annotation fields processed
                    annotation_fields_processed = []

                    # Columns mapping (INFO field name -> database column)
                    map_columns = database.map_columns(
                        columns=annotation_fields, prefixes=["INFO/"]
                    )

                    # Query dict for fields to remove (update option)
                    query_dict_remove = {}

                    # Fetch annotation fields
                    for annotation_field in annotation_fields:

                        # annotation_field_column - falls back to "INFO"
                        annotation_field_column = map_columns.get(
                            annotation_field, "INFO"
                        )

                        # field new name, if parametered
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # To annotate
                        # force_update_annotation = True
                        # force_append_annotation = True
                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
                        if annotation_field in parquet_hdr_vcf_header_infos and (
                            force_update_annotation
                            or force_append_annotation
                            or (
                                annotation_fields_new_name
                                not in self.get_header().infos
                            )
                        ):

                            # Add field to annotation to process list
                            annotation_fields_processed.append(
                                annotation_fields_new_name
                            )

                            # explode infos for the field
                            annotation_fields_new_name_info_msg = ""
                            if (
                                force_update_annotation
                                and annotation_fields_new_name
                                in self.get_header().infos
                            ):
                                # Remove field from INFO before re-annotating
                                query = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = REGEXP_REPLACE(
                                            concat(table_variants.INFO,''),
                                            ';*{annotation_fields_new_name}=[^;]*',
                                            ''
                                        )
                                WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
                                """
                                annotation_fields_new_name_info_msg = " [update]"
                                query_dict_remove[
                                    f"remove 'INFO/{annotation_fields_new_name}'"
                                ] = query

                            # Sep between fields in INFO (';' from the second field on)
                            nb_annotation_field += 1
                            if nb_annotation_field > 1:
                                annotation_field_sep = ";"
                            else:
                                annotation_field_sep = ""

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
                            )

                            # Add INFO field to header, with defaults for
                            # missing metadata
                            parquet_hdr_vcf_header_infos_number = (
                                parquet_hdr_vcf_header_infos[annotation_field].num
                                or "."
                            )
                            parquet_hdr_vcf_header_infos_type = (
                                parquet_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            parquet_hdr_vcf_header_infos_description = (
                                parquet_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            parquet_hdr_vcf_header_infos_source = (
                                parquet_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            parquet_hdr_vcf_header_infos_version = (
                                parquet_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    parquet_hdr_vcf_header_infos_number,
                                    parquet_hdr_vcf_header_infos_type,
                                    parquet_hdr_vcf_header_infos_description,
                                    parquet_hdr_vcf_header_infos_source,
                                    parquet_hdr_vcf_header_infos_version,
                                    self.code_type_map[
                                        parquet_hdr_vcf_header_infos_type
                                    ],
                                )
                            )

                            # Append - only annotate variants whose field is
                            # currently empty ('' or '.')
                            if force_append_annotation:
                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
                            else:
                                query_case_when_append = ""

                            # Annotation/Update query fields
                            # Found in INFO column
                            if (
                                annotation_field_column == "INFO"
                                and "INFO" in parquet_hdr_vcf_header_columns
                            ):
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
                                    ELSE ''
                                END
                                """
                                )
                            # Found in a specific column
                            else:
                                sql_query_annotation_update_info_sets.append(
                                    f"""
                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
                                    THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
                                    ELSE ''
                                END
                                """
                                )
                                sql_query_annotation_to_agregate.append(
                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
                                )

                        # Not to annotate
                        else:

                            if force_update_annotation:
                                annotation_message = "forced"
                            else:
                                annotation_message = "skipped"

                            if annotation_field not in parquet_hdr_vcf_header_infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
                                )

                    # Check if ALL fields have to be annotated. Thus concat all INFO field
                    # allow_annotation_full_info = True
                    allow_annotation_full_info = not force_append_annotation

                    if parquet_type in ["regions"]:
                        allow_annotation_full_info = False

                    # Fast path: copy the database's whole INFO column in one go
                    if (
                        allow_annotation_full_info
                        and nb_annotation_field == len(annotation_fields)
                        and annotation_fields_all
                        and (
                            "INFO" in parquet_hdr_vcf_header_columns
                            and "INFO" in database.get_extra_columns()
                        )
                    ):
                        log.debug("Column INFO annotation enabled")
                        sql_query_annotation_update_info_sets = []
                        sql_query_annotation_update_info_sets.append(
                            f" table_parquet.INFO "
                        )

                    if sql_query_annotation_update_info_sets:

                        # Annotate
                        log.info(f"Annotation '{annotation_name}' - Annotation...")

                        # Join query annotation update info sets for SQL
                        sql_query_annotation_update_info_sets_sql = ",".join(
                            sql_query_annotation_update_info_sets
                        )

                        # Check chromosomes list (and variants infos)
                        sql_query_chromosomes = f"""
                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
                            FROM {table_variants} as table_variants
                            GROUP BY table_variants."#CHROM"
                            ORDER BY table_variants."#CHROM"
                            """
                        sql_query_chromosomes_df = self.conn.execute(
                            sql_query_chromosomes
                        ).df()
                        sql_query_chromosomes_dict = {
                            entry["CHROM"]: {
                                "count": entry["count_variants"],
                                "min": entry["min_variants"],
                                "max": entry["max_variants"],
                            }
                            for index, entry in sql_query_chromosomes_df.iterrows()
                        }

                        # Init
                        nb_of_query = 0
                        nb_of_variant_annotated = 0
                        # Removal queries (update mode) run before annotation queries
                        query_dict = query_dict_remove

                        # for chrom in sql_query_chromosomes_df["CHROM"]:
                        for chrom in sql_query_chromosomes_dict:

                            # Number of variant by chromosome
                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
                                chrom, {}
                            ).get("count", 0)

                            log.debug(
                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
                            )

                            # Annotation with regions database - positional
                            # overlap join, fields aggregated per POS
                            if parquet_type in ["regions"]:
                                sql_query_annotation_from_clause = f"""
                                    FROM (
                                        SELECT
                                            '{chrom}' AS \"#CHROM\",
                                            table_variants_from.\"POS\" AS \"POS\",
                                            {",".join(sql_query_annotation_to_agregate)}
                                        FROM {table_variants} as table_variants_from
                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
                                            table_parquet_from.\"#CHROM\" = '{chrom}'
                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
                                            )
                                        )
                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
                                        GROUP BY table_variants_from.\"POS\"
                                    )
                                    as table_parquet
                                """

                                sql_query_annotation_where_clause = """
                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                """

                            # Annotation with variants database - exact match on
                            # CHROM/POS/REF/ALT
                            else:
                                sql_query_annotation_from_clause = f"""
                                    FROM {parquet_file_link} as table_parquet
                                """
                                sql_query_annotation_where_clause = f"""
                                    table_variants."#CHROM" = '{chrom}'
                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
                                """

                            # Create update query - appends the new fields to
                            # INFO, inserting ';' only when INFO is non-empty
                            # and something is actually added
                            sql_query_annotation_chrom_interval_pos = f"""
                                UPDATE {table_variants} as table_variants
                                SET INFO = 
                                    concat(
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            THEN table_variants.INFO
                                            ELSE ''
                                        END
                                        ,
                                        CASE WHEN table_variants.INFO NOT IN ('','.')
                                            AND (
                                                concat({sql_query_annotation_update_info_sets_sql})
                                                )
                                                NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        {sql_query_annotation_update_info_sets_sql}
                                    )
                                {sql_query_annotation_from_clause}
                                WHERE {sql_query_annotation_where_clause}
                                ;
                                """

                            # Add update query to dict
                            query_dict[
                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
                            ] = sql_query_annotation_chrom_interval_pos

                        nb_of_query = len(query_dict)
                        num_query = 0

                        # SET max_expression_depth TO x - the generated concat
                        # expressions can exceed DuckDB's default depth
                        self.conn.execute("SET max_expression_depth TO 10000")

                        for query_name in query_dict:
                            query = query_dict[query_name]
                            num_query += 1
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
                            )
                            result = self.conn.execute(query)
                            # UPDATE result exposes the affected row count as "Count"
                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
                            log.info(
                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
                            )

                        log.info(
                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
                        )

                    else:

                        log.info(
                            f"Annotation '{annotation_name}' - No Annotations available"
                        )

        log.debug("Final header: " + str(vcf_reader.infos))

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

    def annotation_splice(self, threads: int = None) -> None:
        """
        Annotate the loaded variants with splice tools (SpliceAI/SPiP) run
        through a Docker-based Nextflow pipeline.

        :param threads: The number of threads to use
        :return: None (returns early if the VCF is empty or already annotated)
6369 """ 6370 6371 # DEBUG 6372 log.debug("Start annotation with splice tools") 6373 6374 # Threads 6375 if not threads: 6376 threads = self.get_threads() 6377 log.debug("Threads: " + str(threads)) 6378 6379 # DEBUG 6380 delete_tmp = True 6381 if self.get_config().get("verbosity", "warning") in ["debug"]: 6382 delete_tmp = False 6383 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6384 6385 # Config 6386 config = self.get_config() 6387 log.debug("Config: " + str(config)) 6388 splice_config = config.get("tools", {}).get("splice", {}) 6389 if not splice_config: 6390 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6391 msg_err = "No Splice tool config" 6392 raise ValueError(msg_err) 6393 log.debug(f"splice_config: {splice_config}") 6394 6395 # Config - Folders - Databases 6396 databases_folders = ( 6397 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6398 ) 6399 log.debug("Databases annotations: " + str(databases_folders)) 6400 6401 # Splice docker image 6402 splice_docker_image = splice_config.get("docker").get("image") 6403 6404 # Pull splice image if it's not already there 6405 if not check_docker_image_exists(splice_docker_image): 6406 log.warning( 6407 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6408 ) 6409 try: 6410 command(f"docker pull {splice_config.get('docker').get('image')}") 6411 except subprocess.CalledProcessError: 6412 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6413 log.error(msg_err) 6414 raise ValueError(msg_err) 6415 6416 # Config - splice databases 6417 splice_databases = ( 6418 config.get("folders", {}) 6419 .get("databases", {}) 6420 .get("splice", DEFAULT_SPLICE_FOLDER) 6421 ) 6422 splice_databases = full_path(splice_databases) 6423 6424 # Param 6425 param = self.get_param() 6426 log.debug("Param: " + str(param)) 6427 6428 # Param 6429 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6430 
log.debug("Options: " + str(options)) 6431 6432 # Data 6433 table_variants = self.get_table_variants() 6434 6435 # Check if not empty 6436 log.debug("Check if not empty") 6437 sql_query_chromosomes = ( 6438 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6439 ) 6440 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6441 log.info("VCF empty") 6442 return None 6443 6444 # Export in VCF 6445 log.debug("Create initial file to annotate") 6446 6447 # Create output folder / work folder 6448 if options.get("output_folder", ""): 6449 output_folder = options.get("output_folder", "") 6450 if not os.path.exists(output_folder): 6451 Path(output_folder).mkdir(parents=True, exist_ok=True) 6452 else: 6453 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6454 if not os.path.exists(output_folder): 6455 Path(output_folder).mkdir(parents=True, exist_ok=True) 6456 6457 if options.get("workdir", ""): 6458 workdir = options.get("workdir", "") 6459 else: 6460 workdir = "/work" 6461 6462 # Create tmp VCF file 6463 tmp_vcf = NamedTemporaryFile( 6464 prefix=self.get_prefix(), 6465 dir=output_folder, 6466 suffix=".vcf", 6467 delete=False, 6468 ) 6469 tmp_vcf_name = tmp_vcf.name 6470 6471 # VCF header 6472 header = self.get_header() 6473 6474 # Existing annotations 6475 for vcf_annotation in self.get_header().infos: 6476 6477 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6478 log.debug( 6479 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6480 ) 6481 6482 # Memory limit 6483 if config.get("memory", None): 6484 memory_limit = config.get("memory", "8G").upper() 6485 # upper() 6486 else: 6487 memory_limit = "8G" 6488 log.debug(f"memory_limit: {memory_limit}") 6489 6490 # Check number of variants to annotate 6491 where_clause_regex_spliceai = r"SpliceAI_\w+" 6492 where_clause_regex_spip = r"SPiP_\w+" 6493 where_clause = f""" WHERE NOT regexp_matches("INFO", 
'{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6494 df_list_of_variants_to_annotate = self.get_query_to_df( 6495 query=f""" SELECT * FROM variants {where_clause} """ 6496 ) 6497 if len(df_list_of_variants_to_annotate) == 0: 6498 log.warning( 6499 f"No variants to annotate with splice. Variants probably already annotated with splice" 6500 ) 6501 return None 6502 else: 6503 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6504 6505 # Export VCF file 6506 self.export_variant_vcf( 6507 vcf_file=tmp_vcf_name, 6508 remove_info=True, 6509 add_samples=True, 6510 index=False, 6511 where_clause=where_clause, 6512 ) 6513 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6514 if any(value for value in splice_config.values() if value is None): 6515 log.warning("At least one splice config parameter is empty") 6516 # exit annotation_splice 6517 return None 6518 6519 # Params in splice nf 6520 def check_values(dico: dict): 6521 """ 6522 Ensure parameters for NF splice pipeline 6523 """ 6524 for key, val in dico.items(): 6525 if key == "genome": 6526 if any( 6527 assemb in options.get("genome", {}) 6528 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6529 ): 6530 yield f"--{key} hg19" 6531 elif any( 6532 assemb in options.get("genome", {}) 6533 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6534 ): 6535 yield f"--{key} hg38" 6536 elif ( 6537 (isinstance(val, str) and val) 6538 or isinstance(val, int) 6539 or isinstance(val, bool) 6540 ): 6541 yield f"--{key} {val}" 6542 6543 # Genome 6544 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6545 options["genome"] = genome 6546 # NF params 6547 nf_params = [] 6548 # Add options 6549 if options: 6550 log.debug(options) 6551 nf_params = list(check_values(options)) 6552 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6553 else: 6554 log.debug("No NF params provided") 6555 # Add threads 6556 if "threads" not in 
options.keys(): 6557 nf_params.append(f"--threads {threads}") 6558 # Genome path 6559 genome_path = find_genome( 6560 config.get("folders", {}) 6561 .get("databases", {}) 6562 .get("genomes", DEFAULT_GENOME_FOLDER), 6563 file=f"{genome}.fa", 6564 ) 6565 # Add genome path 6566 if not genome_path: 6567 raise ValueError( 6568 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6569 ) 6570 else: 6571 log.debug(f"Genome: {genome_path}") 6572 nf_params.append(f"--genome_path {genome_path}") 6573 6574 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6575 """ 6576 Setting up updated databases for SPiP and SpliceAI 6577 """ 6578 6579 try: 6580 6581 # SpliceAI assembly transcriptome 6582 spliceai_assembly = os.path.join( 6583 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6584 options.get("genome"), 6585 "transcriptome", 6586 ) 6587 spip_assembly = options.get("genome") 6588 6589 spip = find( 6590 f"transcriptome_{spip_assembly}.RData", 6591 config.get("folders", {}).get("databases", {}).get("spip", {}), 6592 ) 6593 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6594 log.debug(f"SPiP annotations: {spip}") 6595 log.debug(f"SpliceAI annotations: {spliceai}") 6596 if spip and spliceai: 6597 return [ 6598 f"--spip_transcriptome {spip}", 6599 f"--spliceai_transcriptome {spliceai}", 6600 ] 6601 else: 6602 log.warning( 6603 "Can't find splice databases in configuration, use annotations file from image" 6604 ) 6605 except TypeError: 6606 log.warning( 6607 "Can't find splice databases in configuration, use annotations file from image" 6608 ) 6609 return [] 6610 6611 # Add options, check if transcriptome option have already beend provided 6612 if ( 6613 "spip_transcriptome" not in nf_params 6614 and "spliceai_transcriptome" not in nf_params 6615 ): 6616 splice_reference = splice_annotations(options, config) 6617 if splice_reference: 6618 
nf_params.extend(splice_reference) 6619 # nf_params.append(f"--output_folder {output_folder}") 6620 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6621 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6622 log.debug(cmd) 6623 splice_config["docker"]["command"] = cmd 6624 6625 # Ensure proxy is set 6626 proxy = [ 6627 f"-e {var}={os.getenv(var)}" 6628 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6629 if os.getenv(var) is not None 6630 ] 6631 docker_cmd = get_bin_command( 6632 tool="splice", 6633 bin_type="docker", 6634 config=config, 6635 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6636 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6637 ) 6638 # print(docker_cmd) 6639 # exit() 6640 # Docker debug 6641 # if splice_config.get("rm_container"): 6642 # rm_container = "--rm" 6643 # else: 6644 # rm_container = "" 6645 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6646 log.debug(docker_cmd) 6647 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6648 log.debug(res.stdout) 6649 if res.stderr: 6650 log.error(res.stderr) 6651 res.check_returncode() 6652 # Update variants 6653 log.info("Annotation - Updating...") 6654 # Test find output vcf 6655 log.debug( 6656 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6657 ) 6658 output_vcf = [] 6659 # Wrong folder to look in 6660 for files in os.listdir(os.path.dirname(tmp_vcf_name)): 6661 if ( 6662 files 6663 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6664 ): 6665 
output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6666 # log.debug(os.listdir(options.get("output_folder"))) 6667 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6668 if not output_vcf: 6669 log.debug( 6670 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6671 ) 6672 else: 6673 # Get new header from annotated vcf 6674 log.debug(f"Initial header: {len(header.infos)} fields") 6675 # Create new header with splice infos 6676 new_vcf = Variants(input=output_vcf[0]) 6677 new_vcf_header = new_vcf.get_header().infos 6678 for keys, infos in new_vcf_header.items(): 6679 if keys not in header.infos.keys(): 6680 header.infos[keys] = infos 6681 log.debug(f"New header: {len(header.infos)} fields") 6682 log.debug(f"Splice tmp output: {output_vcf[0]}") 6683 self.update_from_vcf(output_vcf[0]) 6684 6685 # Remove file 6686 remove_if_exists(output_vcf) 6687 6688 ### 6689 # Prioritization 6690 ### 6691 6692 def get_config_default(self, name: str) -> dict: 6693 """ 6694 The function `get_config_default` returns a dictionary containing default configurations for 6695 various calculations and prioritizations. 6696 6697 :param name: The `get_config_default` function returns a dictionary containing default 6698 configurations for different calculations and prioritizations. The `name` parameter is used to 6699 specify which specific configuration to retrieve from the dictionary 6700 :type name: str 6701 :return: The function `get_config_default` returns a dictionary containing default configuration 6702 settings for different calculations and prioritizations. The specific configuration settings are 6703 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6704 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6705 returned. If there is no match, an empty dictionary is returned. 
6706 """ 6707 6708 config_default = { 6709 "calculations": { 6710 "variant_chr_pos_alt_ref": { 6711 "type": "sql", 6712 "name": "variant_chr_pos_alt_ref", 6713 "description": "Create a variant ID with chromosome, position, alt and ref", 6714 "available": False, 6715 "output_column_name": "variant_chr_pos_alt_ref", 6716 "output_column_type": "String", 6717 "output_column_description": "variant ID with chromosome, position, alt and ref", 6718 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6719 "operation_info": True, 6720 }, 6721 "VARTYPE": { 6722 "type": "sql", 6723 "name": "VARTYPE", 6724 "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)", 6725 "available": True, 6726 "output_column_name": "VARTYPE", 6727 "output_column_type": "String", 6728 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6729 "operation_query": """ 6730 CASE 6731 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6732 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6733 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6734 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6735 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6736 ELSE 'UNDEFINED' 6737 END 6738 """, 6739 "info_fields": ["SVTYPE"], 6740 "operation_info": True, 6741 }, 6742 "snpeff_hgvs": { 6743 "type": "python", 6744 "name": "snpeff_hgvs", 6745 "description": "HGVS nomenclatures from snpEff annotation", 6746 "available": True, 6747 "function_name": "calculation_extract_snpeff_hgvs", 6748 "function_params": ["snpeff_hgvs", "ANN"], 6749 }, 6750 "snpeff_ann_explode": { 6751 "type": "python", 6752 "name": "snpeff_ann_explode", 6753 "description": "Explode snpEff annotations with uniquify values", 6754 "available": True, 6755 "function_name": "calculation_snpeff_ann_explode", 6756 "function_params": [False, "fields", "snpeff_", "ANN"], 6757 }, 6758 "snpeff_ann_explode_uniquify": { 6759 "type": "python", 6760 "name": 
"snpeff_ann_explode_uniquify", 6761 "description": "Explode snpEff annotations", 6762 "available": True, 6763 "function_name": "calculation_snpeff_ann_explode", 6764 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6765 }, 6766 "snpeff_ann_explode_json": { 6767 "type": "python", 6768 "name": "snpeff_ann_explode_json", 6769 "description": "Explode snpEff annotations in JSON format", 6770 "available": True, 6771 "function_name": "calculation_snpeff_ann_explode", 6772 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6773 }, 6774 "NOMEN": { 6775 "type": "python", 6776 "name": "NOMEN", 6777 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field (see parameters help)", 6778 "available": True, 6779 "function_name": "calculation_extract_nomen", 6780 "function_params": [], 6781 }, 6782 "FINDBYPIPELINE": { 6783 "type": "python", 6784 "name": "FINDBYPIPELINE", 6785 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6786 "available": True, 6787 "function_name": "calculation_find_by_pipeline", 6788 "function_params": ["findbypipeline"], 6789 }, 6790 "FINDBYSAMPLE": { 6791 "type": "python", 6792 "name": "FINDBYSAMPLE", 6793 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6794 "available": True, 6795 "function_name": "calculation_find_by_pipeline", 6796 "function_params": ["findbysample"], 6797 }, 6798 "GENOTYPECONCORDANCE": { 6799 "type": "python", 6800 "name": "GENOTYPECONCORDANCE", 6801 "description": "Concordance of genotype for multi caller VCF", 6802 "available": True, 6803 "function_name": "calculation_genotype_concordance", 6804 "function_params": [], 6805 }, 6806 "BARCODE": { 6807 "type": "python", 6808 "name": "BARCODE", 6809 "description": "BARCODE as VaRank tool", 6810 "available": True, 6811 "function_name": "calculation_barcode", 6812 "function_params": [], 6813 }, 6814 "BARCODEFAMILY": { 6815 "type": "python", 6816 
"name": "BARCODEFAMILY", 6817 "description": "BARCODEFAMILY as VaRank tool", 6818 "available": True, 6819 "function_name": "calculation_barcode_family", 6820 "function_params": ["BCF"], 6821 }, 6822 "TRIO": { 6823 "type": "python", 6824 "name": "TRIO", 6825 "description": "Inheritance for a trio family", 6826 "available": True, 6827 "function_name": "calculation_trio", 6828 "function_params": [], 6829 }, 6830 "VAF": { 6831 "type": "python", 6832 "name": "VAF", 6833 "description": "Variant Allele Frequency (VAF) harmonization", 6834 "available": True, 6835 "function_name": "calculation_vaf_normalization", 6836 "function_params": [], 6837 }, 6838 "VAF_stats": { 6839 "type": "python", 6840 "name": "VAF_stats", 6841 "description": "Variant Allele Frequency (VAF) statistics", 6842 "available": True, 6843 "function_name": "calculation_genotype_stats", 6844 "function_params": ["VAF"], 6845 }, 6846 "DP_stats": { 6847 "type": "python", 6848 "name": "DP_stats", 6849 "description": "Depth (DP) statistics", 6850 "available": True, 6851 "function_name": "calculation_genotype_stats", 6852 "function_params": ["DP"], 6853 }, 6854 "variant_id": { 6855 "type": "python", 6856 "name": "variant_id", 6857 "description": "Variant ID generated from variant position and type", 6858 "available": True, 6859 "function_name": "calculation_variant_id", 6860 "function_params": [], 6861 }, 6862 "transcripts_json": { 6863 "type": "python", 6864 "name": "transcripts_json", 6865 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6866 "available": True, 6867 "function_name": "calculation_transcripts_annotation", 6868 "function_params": ["transcripts_json", None], 6869 }, 6870 "transcripts_ann": { 6871 "type": "python", 6872 "name": "transcripts_ann", 6873 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6874 "available": True, 6875 "function_name": "calculation_transcripts_annotation", 6876 "function_params": [None, 
"transcripts_ann"], 6877 }, 6878 "transcripts_annotations": { 6879 "type": "python", 6880 "name": "transcripts_annotations", 6881 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6882 "available": True, 6883 "function_name": "calculation_transcripts_annotation", 6884 "function_params": [None, None], 6885 }, 6886 "transcripts_prioritization": { 6887 "type": "python", 6888 "name": "transcripts_prioritization", 6889 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6890 "available": True, 6891 "function_name": "calculation_transcripts_prioritization", 6892 "function_params": [], 6893 }, 6894 "transcripts_export": { 6895 "type": "python", 6896 "name": "transcripts_export", 6897 "description": "Export transcripts table/view as a file (using param.json)", 6898 "available": True, 6899 "function_name": "calculation_transcripts_export", 6900 "function_params": [], 6901 }, 6902 }, 6903 "prioritizations": { 6904 "default": { 6905 "ANN2": [ 6906 { 6907 "type": "contains", 6908 "value": "HIGH", 6909 "score": 5, 6910 "flag": "PASS", 6911 "comment": [ 6912 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6913 ], 6914 }, 6915 { 6916 "type": "contains", 6917 "value": "MODERATE", 6918 "score": 3, 6919 "flag": "PASS", 6920 "comment": [ 6921 "A non-disruptive variant that might change protein effectiveness" 6922 ], 6923 }, 6924 { 6925 "type": "contains", 6926 "value": "LOW", 6927 "score": 0, 6928 "flag": "FILTERED", 6929 "comment": [ 6930 "Assumed to be mostly harmless or unlikely to change protein behavior" 6931 ], 6932 }, 6933 { 6934 "type": "contains", 6935 "value": "MODIFIER", 6936 "score": 0, 6937 "flag": "FILTERED", 6938 "comment": [ 6939 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 
6940 ], 6941 }, 6942 ], 6943 } 6944 }, 6945 } 6946 6947 return config_default.get(name, None) 6948 6949 def get_config_json( 6950 self, name: str, config_dict: dict = {}, config_file: str = None 6951 ) -> dict: 6952 """ 6953 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6954 default values, a dictionary, and a file. 6955 6956 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6957 the name of the configuration. It is used to identify and retrieve the configuration settings 6958 for a specific component or module 6959 :type name: str 6960 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6961 dictionary that allows you to provide additional configuration settings or overrides. When you 6962 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6963 the key is the configuration setting you want to override or 6964 :type config_dict: dict 6965 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6966 specify the path to a configuration file that contains additional settings. If provided, the 6967 function will read the contents of this file and update the configuration dictionary with the 6968 values found in the file, overriding any existing values with the 6969 :type config_file: str 6970 :return: The function `get_config_json` returns a dictionary containing the configuration 6971 settings. 
6972 """ 6973 6974 # Create with default prioritizations 6975 config_default = self.get_config_default(name=name) 6976 configuration = config_default 6977 # log.debug(f"configuration={configuration}") 6978 6979 # Replace prioritizations from dict 6980 for config in config_dict: 6981 configuration[config] = config_dict[config] 6982 6983 # Replace prioritizations from file 6984 config_file = full_path(config_file) 6985 if config_file: 6986 if os.path.exists(config_file): 6987 with open(config_file) as config_file_content: 6988 config_file_dict = json.load(config_file_content) 6989 for config in config_file_dict: 6990 configuration[config] = config_file_dict[config] 6991 else: 6992 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6993 log.error(msg_error) 6994 raise ValueError(msg_error) 6995 6996 return configuration 6997 6998 def prioritization( 6999 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7000 ) -> bool: 7001 """ 7002 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7003 prioritizes variants based on configured profiles and criteria. 7004 7005 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7006 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7007 a table name is provided, the method will prioritize the variants in that specific table 7008 :type table: str 7009 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7010 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7011 provided, the code will use a default prefix value of "PZ" 7012 :type pz_prefix: str 7013 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7014 additional parameters specific to the prioritization process. 
These parameters can include 7015 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7016 configurations needed for the prioritization of variants in a V 7017 :type pz_param: dict 7018 :return: A boolean value (True) is being returned from the `prioritization` function. 7019 """ 7020 7021 # Config 7022 config = self.get_config() 7023 7024 # Param 7025 param = self.get_param() 7026 7027 # Prioritization param 7028 if pz_param is not None: 7029 prioritization_param = pz_param 7030 else: 7031 prioritization_param = param.get("prioritization", {}) 7032 7033 # Configuration profiles 7034 prioritization_config_file = prioritization_param.get( 7035 "prioritization_config", None 7036 ) 7037 prioritization_config_file = full_path(prioritization_config_file) 7038 prioritizations_config = self.get_config_json( 7039 name="prioritizations", config_file=prioritization_config_file 7040 ) 7041 7042 # Prioritization prefix 7043 pz_prefix_default = "PZ" 7044 if pz_prefix is None: 7045 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7046 7047 # Prioritization options 7048 profiles = prioritization_param.get("profiles", []) 7049 if isinstance(profiles, str): 7050 profiles = profiles.split(",") 7051 pzfields = prioritization_param.get( 7052 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7053 ) 7054 if isinstance(pzfields, str): 7055 pzfields = pzfields.split(",") 7056 default_profile = prioritization_param.get("default_profile", None) 7057 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7058 prioritization_score_mode = prioritization_param.get( 7059 "prioritization_score_mode", "HOWARD" 7060 ) 7061 7062 # Quick Prioritizations 7063 prioritizations = param.get("prioritizations", None) 7064 if prioritizations: 7065 log.info("Quick Prioritization:") 7066 for profile in prioritizations.split(","): 7067 if profile not in profiles: 7068 profiles.append(profile) 7069 log.info(f" {profile}") 7070 7071 # If 
profile "ALL" provided, all profiles in the config profiles 7072 if "ALL" in profiles: 7073 profiles = list(prioritizations_config.keys()) 7074 7075 for profile in profiles: 7076 if prioritizations_config.get(profile, None): 7077 log.debug(f"Profile '{profile}' configured") 7078 else: 7079 msg_error = f"Profile '{profile}' NOT configured" 7080 log.error(msg_error) 7081 raise ValueError(msg_error) 7082 7083 if profiles: 7084 log.info(f"Prioritization... ") 7085 else: 7086 log.debug(f"No profile defined") 7087 return False 7088 7089 if not default_profile and len(profiles): 7090 default_profile = profiles[0] 7091 7092 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7093 log.debug("Profiles to check: " + str(list(profiles))) 7094 7095 # Variables 7096 if table is not None: 7097 table_variants = table 7098 else: 7099 table_variants = self.get_table_variants(clause="update") 7100 log.debug(f"Table to prioritize: {table_variants}") 7101 7102 # Added columns 7103 added_columns = [] 7104 7105 # Create list of PZfields 7106 # List of PZFields 7107 list_of_pzfields_original = pzfields + [ 7108 pzfield + pzfields_sep + profile 7109 for pzfield in pzfields 7110 for profile in profiles 7111 ] 7112 list_of_pzfields = [] 7113 log.debug(f"{list_of_pzfields_original}") 7114 7115 # Remove existing PZfields to use if exists 7116 for pzfield in list_of_pzfields_original: 7117 if self.get_header().infos.get(pzfield, None) is None: 7118 list_of_pzfields.append(pzfield) 7119 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7120 else: 7121 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7122 7123 if list_of_pzfields: 7124 7125 # Explode Infos prefix 7126 explode_infos_prefix = self.get_explode_infos_prefix() 7127 7128 # PZfields tags description 7129 PZfields_INFOS = { 7130 f"{pz_prefix}Tags": { 7131 "ID": f"{pz_prefix}Tags", 7132 "Number": ".", 7133 "Type": "String", 7134 "Description": "Variant tags based on annotation 
criteria", 7135 }, 7136 f"{pz_prefix}Score": { 7137 "ID": f"{pz_prefix}Score", 7138 "Number": 1, 7139 "Type": "Integer", 7140 "Description": "Variant score based on annotation criteria", 7141 }, 7142 f"{pz_prefix}Flag": { 7143 "ID": f"{pz_prefix}Flag", 7144 "Number": 1, 7145 "Type": "String", 7146 "Description": "Variant flag based on annotation criteria", 7147 }, 7148 f"{pz_prefix}Comment": { 7149 "ID": f"{pz_prefix}Comment", 7150 "Number": ".", 7151 "Type": "String", 7152 "Description": "Variant comment based on annotation criteria", 7153 }, 7154 f"{pz_prefix}Infos": { 7155 "ID": f"{pz_prefix}Infos", 7156 "Number": ".", 7157 "Type": "String", 7158 "Description": "Variant infos based on annotation criteria", 7159 }, 7160 f"{pz_prefix}Class": { 7161 "ID": f"{pz_prefix}Class", 7162 "Number": ".", 7163 "Type": "String", 7164 "Description": "Variant class based on annotation criteria", 7165 }, 7166 } 7167 7168 # Create INFO fields if not exist 7169 for field in PZfields_INFOS: 7170 field_ID = PZfields_INFOS[field]["ID"] 7171 field_description = PZfields_INFOS[field]["Description"] 7172 if field_ID not in self.get_header().infos and field_ID in pzfields: 7173 field_description = ( 7174 PZfields_INFOS[field]["Description"] 7175 + f", profile {default_profile}" 7176 ) 7177 self.get_header().infos[field_ID] = vcf.parser._Info( 7178 field_ID, 7179 PZfields_INFOS[field]["Number"], 7180 PZfields_INFOS[field]["Type"], 7181 field_description, 7182 "unknown", 7183 "unknown", 7184 code_type_map[PZfields_INFOS[field]["Type"]], 7185 ) 7186 7187 # Create INFO fields if not exist for each profile 7188 for profile in prioritizations_config: 7189 if profile in profiles or profiles == []: 7190 for field in PZfields_INFOS: 7191 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7192 field_description = ( 7193 PZfields_INFOS[field]["Description"] 7194 + f", profile {profile}" 7195 ) 7196 if ( 7197 field_ID not in self.get_header().infos 7198 and field in pzfields 7199 ): 
7200 self.get_header().infos[field_ID] = vcf.parser._Info( 7201 field_ID, 7202 PZfields_INFOS[field]["Number"], 7203 PZfields_INFOS[field]["Type"], 7204 field_description, 7205 "unknown", 7206 "unknown", 7207 code_type_map[PZfields_INFOS[field]["Type"]], 7208 ) 7209 7210 # Header 7211 for pzfield in list_of_pzfields: 7212 if re.match(f"{pz_prefix}Score.*", pzfield): 7213 added_column = self.add_column( 7214 table_name=table_variants, 7215 column_name=pzfield, 7216 column_type="INTEGER", 7217 default_value="0", 7218 ) 7219 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7220 added_column = self.add_column( 7221 table_name=table_variants, 7222 column_name=pzfield, 7223 column_type="BOOLEAN", 7224 default_value="1", 7225 ) 7226 elif re.match(f"{pz_prefix}Class.*", pzfield): 7227 added_column = self.add_column( 7228 table_name=table_variants, 7229 column_name=pzfield, 7230 column_type="VARCHAR[]", 7231 default_value="null", 7232 ) 7233 else: 7234 added_column = self.add_column( 7235 table_name=table_variants, 7236 column_name=pzfield, 7237 column_type="STRING", 7238 default_value="''", 7239 ) 7240 added_columns.append(added_column) 7241 7242 # Profiles 7243 if profiles: 7244 7245 # foreach profile in configuration file 7246 for profile in prioritizations_config: 7247 7248 # If profile is asked in param, or ALL are asked (empty profile []) 7249 if profile in profiles or profiles == []: 7250 log.info(f"Profile '{profile}'") 7251 7252 sql_set_info_option = "" 7253 7254 sql_set_info = [] 7255 7256 # PZ fields set 7257 7258 # PZScore 7259 if ( 7260 f"{pz_prefix}Score{pzfields_sep}{profile}" 7261 in list_of_pzfields 7262 ): 7263 sql_set_info.append( 7264 f""" 7265 concat( 7266 '{pz_prefix}Score{pzfields_sep}{profile}=', 7267 {pz_prefix}Score{pzfields_sep}{profile} 7268 ) 7269 """ 7270 ) 7271 if ( 7272 profile == default_profile 7273 and f"{pz_prefix}Score" in list_of_pzfields 7274 ): 7275 sql_set_info.append( 7276 f""" 7277 concat( 7278 '{pz_prefix}Score=', 7279 
{pz_prefix}Score{pzfields_sep}{profile} 7280 ) 7281 """ 7282 ) 7283 7284 # PZFlag 7285 if ( 7286 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7287 in list_of_pzfields 7288 ): 7289 sql_set_info.append( 7290 f""" 7291 concat( 7292 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7293 CASE 7294 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7295 THEN 'PASS' 7296 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7297 THEN 'FILTERED' 7298 END 7299 ) 7300 """ 7301 ) 7302 if ( 7303 profile == default_profile 7304 and f"{pz_prefix}Flag" in list_of_pzfields 7305 ): 7306 sql_set_info.append( 7307 f""" 7308 concat( 7309 '{pz_prefix}Flag=', 7310 CASE 7311 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7312 THEN 'PASS' 7313 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7314 THEN 'FILTERED' 7315 END 7316 ) 7317 """ 7318 ) 7319 7320 # PZClass 7321 if ( 7322 f"{pz_prefix}Class{pzfields_sep}{profile}" 7323 in list_of_pzfields 7324 ): 7325 sql_set_info.append( 7326 f""" 7327 concat( 7328 '{pz_prefix}Class{pzfields_sep}{profile}=', 7329 CASE 7330 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7331 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7332 ELSE '.' 7333 END 7334 ) 7335 7336 """ 7337 ) 7338 if ( 7339 profile == default_profile 7340 and f"{pz_prefix}Class" in list_of_pzfields 7341 ): 7342 sql_set_info.append( 7343 f""" 7344 concat( 7345 '{pz_prefix}Class=', 7346 CASE 7347 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7348 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7349 ELSE '.' 
7350 END 7351 ) 7352 """ 7353 ) 7354 7355 # PZComment 7356 if ( 7357 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7358 in list_of_pzfields 7359 ): 7360 sql_set_info.append( 7361 f""" 7362 CASE 7363 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7364 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7365 ELSE '' 7366 END 7367 """ 7368 ) 7369 if ( 7370 profile == default_profile 7371 and f"{pz_prefix}Comment" in list_of_pzfields 7372 ): 7373 sql_set_info.append( 7374 f""" 7375 CASE 7376 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7377 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7378 ELSE '' 7379 END 7380 """ 7381 ) 7382 7383 # PZInfos 7384 if ( 7385 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7386 in list_of_pzfields 7387 ): 7388 sql_set_info.append( 7389 f""" 7390 CASE 7391 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7392 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7393 ELSE '' 7394 END 7395 """ 7396 ) 7397 if ( 7398 profile == default_profile 7399 and f"{pz_prefix}Infos" in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 CASE 7404 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7405 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7406 ELSE '' 7407 END 7408 """ 7409 ) 7410 7411 # Merge PZfields 7412 sql_set_info_option = "" 7413 sql_set_sep = "" 7414 for sql_set in sql_set_info: 7415 if sql_set_sep: 7416 sql_set_info_option += f""" 7417 , concat('{sql_set_sep}', {sql_set}) 7418 """ 7419 else: 7420 sql_set_info_option += f""" 7421 , {sql_set} 7422 """ 7423 sql_set_sep = ";" 7424 7425 sql_queries = [] 7426 for annotation in prioritizations_config[profile]: 7427 7428 # skip special sections 7429 if annotation.startswith("_"): 7430 continue 7431 7432 # For each criterions 7433 for criterion in prioritizations_config[profile][ 7434 annotation 
7435 ]: 7436 7437 # Criterion mode 7438 criterion_mode = None 7439 if np.any( 7440 np.isin(list(criterion.keys()), ["type", "value"]) 7441 ): 7442 criterion_mode = "operation" 7443 elif np.any( 7444 np.isin(list(criterion.keys()), ["sql", "fields"]) 7445 ): 7446 criterion_mode = "sql" 7447 log.debug(f"Criterion Mode: {criterion_mode}") 7448 7449 # Criterion parameters 7450 criterion_type = criterion.get("type", None) 7451 criterion_value = criterion.get("value", None) 7452 criterion_sql = criterion.get("sql", None) 7453 criterion_fields = criterion.get("fields", None) 7454 criterion_score = criterion.get("score", 0) 7455 criterion_flag = criterion.get("flag", "PASS") 7456 criterion_class = criterion.get("class", None) 7457 criterion_flag_bool = criterion_flag == "PASS" 7458 criterion_comment = ( 7459 ", ".join(criterion.get("comment", [])) 7460 .replace("'", "''") 7461 .replace(";", ",") 7462 .replace("\t", " ") 7463 ) 7464 criterion_infos = ( 7465 str(criterion) 7466 .replace("'", "''") 7467 .replace(";", ",") 7468 .replace("\t", " ") 7469 ) 7470 7471 # SQL 7472 if criterion_sql is not None and isinstance( 7473 criterion_sql, list 7474 ): 7475 criterion_sql = " ".join(criterion_sql) 7476 7477 # Fields and explode 7478 if criterion_fields is None: 7479 criterion_fields = [annotation] 7480 if not isinstance(criterion_fields, list): 7481 criterion_fields = str(criterion_fields).split(",") 7482 7483 # Class 7484 if criterion_class is not None and not isinstance( 7485 criterion_class, list 7486 ): 7487 criterion_class = str(criterion_class).split(",") 7488 7489 for annotation_field in criterion_fields: 7490 7491 # Explode specific annotation 7492 log.debug( 7493 f"Explode annotation '{annotation_field}'" 7494 ) 7495 added_columns += self.explode_infos( 7496 prefix=explode_infos_prefix, 7497 fields=[annotation_field], 7498 table=table_variants, 7499 ) 7500 extra_infos = self.get_extra_infos( 7501 table=table_variants 7502 ) 7503 7504 # Check if annotation field is 
present 7505 if ( 7506 f"{explode_infos_prefix}{annotation_field}" 7507 not in extra_infos 7508 ): 7509 msq_err = f"Annotation '{annotation_field}' not in data" 7510 log.error(msq_err) 7511 raise ValueError(msq_err) 7512 else: 7513 log.debug( 7514 f"Annotation '{annotation_field}' in data" 7515 ) 7516 7517 sql_set = [] 7518 sql_set_info = [] 7519 7520 # PZ fields set 7521 7522 # PZScore 7523 if ( 7524 f"{pz_prefix}Score{pzfields_sep}{profile}" 7525 in list_of_pzfields 7526 ): 7527 # if prioritization_score_mode == "HOWARD": 7528 # sql_set.append( 7529 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7530 # ) 7531 # VaRank prioritization score mode 7532 if prioritization_score_mode == "VaRank": 7533 sql_set.append( 7534 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7535 ) 7536 # default HOWARD prioritization score mode 7537 else: 7538 sql_set.append( 7539 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7540 ) 7541 7542 # PZFlag 7543 if ( 7544 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7545 in list_of_pzfields 7546 ): 7547 sql_set.append( 7548 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7549 ) 7550 7551 # PZClass 7552 if ( 7553 f"{pz_prefix}Class{pzfields_sep}{profile}" 7554 in list_of_pzfields 7555 and criterion_class is not None 7556 ): 7557 sql_set.append( 7558 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7559 ) 7560 7561 # PZComment 7562 if ( 7563 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7564 in list_of_pzfields 7565 ): 7566 sql_set.append( 7567 f""" 7568 {pz_prefix}Comment{pzfields_sep}{profile} = 7569 concat( 7570 {pz_prefix}Comment{pzfields_sep}{profile}, 7571 CASE 7572 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7573 THEN ', ' 7574 ELSE '' 7575 END, 7576 '{criterion_comment}' 7577 ) 7578 """ 7579 ) 7580 7581 # PZInfos 7582 if ( 7583 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7584 in list_of_pzfields 7585 ): 7586 sql_set.append( 7587 f""" 7588 {pz_prefix}Infos{pzfields_sep}{profile} = 7589 concat( 7590 {pz_prefix}Infos{pzfields_sep}{profile}, 7591 '{criterion_infos}' 7592 ) 7593 """ 7594 ) 7595 sql_set_option = ",".join(sql_set) 7596 7597 # Criterion and comparison 7598 if sql_set_option: 7599 7600 if criterion_mode in ["operation"]: 7601 7602 try: 7603 float(criterion_value) 7604 sql_update = f""" 7605 UPDATE {table_variants} 7606 SET {sql_set_option} 7607 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7608 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7609 """ 7610 except: 7611 contains_option = "" 7612 if criterion_type == "contains": 7613 contains_option = ".*" 7614 sql_update = f""" 7615 UPDATE {table_variants} 7616 SET {sql_set_option} 7617 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7618 """ 7619 sql_queries.append(sql_update) 7620 7621 elif criterion_mode in ["sql"]: 7622 7623 sql_update = f""" 7624 UPDATE {table_variants} 7625 SET {sql_set_option} 7626 WHERE {criterion_sql} 7627 """ 7628 sql_queries.append(sql_update) 7629 7630 else: 7631 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7632 log.error(msg_err) 7633 raise ValueError(msg_err) 7634 7635 else: 7636 log.warning( 7637 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7638 ) 7639 7640 # PZTags 7641 if ( 7642 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7643 in list_of_pzfields 7644 ): 7645 7646 # Create PZFalgs value 7647 pztags_value = "" 7648 pztags_sep_default = "," 7649 pztags_sep = "" 7650 for pzfield in pzfields: 7651 if pzfield not in [f"{pz_prefix}Tags"]: 7652 if ( 7653 
f"{pzfield}{pzfields_sep}{profile}" 7654 in list_of_pzfields 7655 ): 7656 if pzfield in [f"{pz_prefix}Flag"]: 7657 pztags_value += f"""{pztags_sep}{pzfield}#', 7658 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7659 THEN 'PASS' 7660 ELSE 'FILTERED' 7661 END, '""" 7662 elif pzfield in [f"{pz_prefix}Class"]: 7663 pztags_value += f"""{pztags_sep}{pzfield}#', 7664 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7665 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7666 ELSE '.' 7667 END, '""" 7668 else: 7669 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7670 pztags_sep = pztags_sep_default 7671 7672 # Add Query update for PZFlags 7673 sql_update_pztags = f""" 7674 UPDATE {table_variants} 7675 SET INFO = concat( 7676 INFO, 7677 CASE WHEN INFO NOT in ('','.') 7678 THEN ';' 7679 ELSE '' 7680 END, 7681 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7682 ) 7683 """ 7684 sql_queries.append(sql_update_pztags) 7685 7686 # Add Query update for PZFlags for default 7687 if profile == default_profile: 7688 sql_update_pztags_default = f""" 7689 UPDATE {table_variants} 7690 SET INFO = concat( 7691 INFO, 7692 ';', 7693 '{pz_prefix}Tags={pztags_value}' 7694 ) 7695 """ 7696 sql_queries.append(sql_update_pztags_default) 7697 7698 log.info(f"""Profile '{profile}' - Prioritization... """) 7699 7700 if sql_queries: 7701 7702 for sql_query in sql_queries: 7703 log.debug( 7704 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7705 ) 7706 self.conn.execute(sql_query) 7707 7708 log.info(f"""Profile '{profile}' - Update... 
""") 7709 sql_query_update = f""" 7710 UPDATE {table_variants} 7711 SET INFO = 7712 concat( 7713 CASE 7714 WHEN INFO NOT IN ('','.') 7715 THEN concat(INFO, ';') 7716 ELSE '' 7717 END 7718 {sql_set_info_option} 7719 ) 7720 """ 7721 self.conn.execute(sql_query_update) 7722 7723 else: 7724 7725 log.warning(f"No profiles in parameters") 7726 7727 # Remove added columns 7728 for added_column in added_columns: 7729 self.drop_column(column=added_column) 7730 7731 # Explode INFOS fields into table fields 7732 if self.get_explode_infos(): 7733 self.explode_infos( 7734 prefix=self.get_explode_infos_prefix(), 7735 fields=self.get_explode_infos_fields(), 7736 force=True, 7737 ) 7738 7739 return True 7740 7741 ### 7742 # HGVS 7743 ### 7744 7745 def annotation_hgvs(self, threads: int = None) -> None: 7746 """ 7747 The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic 7748 coordinates and alleles. 7749 7750 :param threads: The `threads` parameter is an optional integer that specifies the number of 7751 threads to use for parallel processing. If no value is provided, it will default to the number 7752 of threads obtained from the `get_threads()` method 7753 :type threads: int 7754 """ 7755 7756 # Function for each partition of the Dask Dataframe 7757 def partition_function(partition): 7758 """ 7759 The function `partition_function` applies the `annotation_hgvs_partition` function to 7760 each row of a DataFrame called `partition`. 7761 7762 :param partition: The parameter "partition" is a pandas DataFrame that contains the data 7763 to be processed 7764 :return: the result of applying the "annotation_hgvs_partition" function to each row of 7765 the "partition" dataframe along the axis 1. 
7766 """ 7767 return partition.apply(annotation_hgvs_partition, axis=1) 7768 7769 def annotation_hgvs_partition(row) -> str: 7770 """ 7771 The function `annotation_hgvs_partition` takes in a row of data and returns a string 7772 containing a list of HGVS names associated with the given genomic coordinates and alleles. 7773 7774 :param row: A dictionary-like object that contains the values for the following keys: 7775 :return: a string that contains the HGVS names associated with the given row of data. 7776 """ 7777 7778 chr = row["CHROM"] 7779 pos = row["POS"] 7780 ref = row["REF"] 7781 alt = row["ALT"] 7782 7783 # Find list of associated transcripts 7784 transcripts_list = list( 7785 polars_conn.execute( 7786 f""" 7787 SELECT transcript 7788 FROM refseq_df 7789 WHERE CHROM='{chr}' 7790 AND POS={pos} 7791 """ 7792 )["transcript"] 7793 ) 7794 7795 # Full HGVS annotation in list 7796 hgvs_full_list = [] 7797 7798 for transcript_name in transcripts_list: 7799 7800 # Transcript 7801 transcript = get_transcript( 7802 transcripts=transcripts, transcript_name=transcript_name 7803 ) 7804 # Exon 7805 if use_exon: 7806 exon = transcript.find_exon_number(pos) 7807 else: 7808 exon = None 7809 # Protein 7810 transcript_protein = None 7811 if use_protein or add_protein or full_format: 7812 transcripts_protein = list( 7813 polars_conn.execute( 7814 f""" 7815 SELECT protein 7816 FROM refseqlink_df 7817 WHERE transcript='{transcript_name}' 7818 LIMIT 1 7819 """ 7820 )["protein"] 7821 ) 7822 if len(transcripts_protein): 7823 transcript_protein = transcripts_protein[0] 7824 7825 # HGVS name 7826 hgvs_name = format_hgvs_name( 7827 chr, 7828 pos, 7829 ref, 7830 alt, 7831 genome=genome, 7832 transcript=transcript, 7833 transcript_protein=transcript_protein, 7834 exon=exon, 7835 use_gene=use_gene, 7836 use_protein=use_protein, 7837 full_format=full_format, 7838 use_version=use_version, 7839 codon_type=codon_type, 7840 ) 7841 hgvs_full_list.append(hgvs_name) 7842 if add_protein and not 
use_protein and not full_format: 7843 hgvs_name = format_hgvs_name( 7844 chr, 7845 pos, 7846 ref, 7847 alt, 7848 genome=genome, 7849 transcript=transcript, 7850 transcript_protein=transcript_protein, 7851 exon=exon, 7852 use_gene=use_gene, 7853 use_protein=True, 7854 full_format=False, 7855 use_version=use_version, 7856 codon_type=codon_type, 7857 ) 7858 hgvs_full_list.append(hgvs_name) 7859 7860 # Create liste of HGVS annotations 7861 hgvs_full = ",".join(hgvs_full_list) 7862 7863 return hgvs_full 7864 7865 # Polars connexion 7866 polars_conn = pl.SQLContext(register_globals=True, eager=True) 7867 7868 # Config 7869 config = self.get_config() 7870 7871 # Databases 7872 # Genome 7873 databases_genomes_folders = ( 7874 config.get("folders", {}) 7875 .get("databases", {}) 7876 .get("genomes", DEFAULT_GENOME_FOLDER) 7877 ) 7878 databases_genome = ( 7879 config.get("folders", {}).get("databases", {}).get("genomes", "") 7880 ) 7881 # refseq database folder 7882 databases_refseq_folders = ( 7883 config.get("folders", {}) 7884 .get("databases", {}) 7885 .get("refseq", DEFAULT_REFSEQ_FOLDER) 7886 ) 7887 # refseq 7888 databases_refseq = config.get("databases", {}).get("refSeq", None) 7889 # refSeqLink 7890 databases_refseqlink = config.get("databases", {}).get("refSeqLink", None) 7891 7892 # Param 7893 param = self.get_param() 7894 7895 # Quick HGVS 7896 if "hgvs_options" in param and param.get("hgvs_options", ""): 7897 log.info(f"Quick HGVS Annotation:") 7898 if not param.get("hgvs", None): 7899 param["hgvs"] = {} 7900 for option in param.get("hgvs_options", "").split(","): 7901 option_var_val = option.split("=") 7902 option_var = option_var_val[0] 7903 if len(option_var_val) > 1: 7904 option_val = option_var_val[1] 7905 else: 7906 option_val = "True" 7907 if option_val.upper() in ["TRUE"]: 7908 option_val = True 7909 elif option_val.upper() in ["FALSE"]: 7910 option_val = False 7911 log.info(f" {option_var}={option_val}") 7912 param["hgvs"][option_var] = option_val 7913 
7914 # Check if HGVS annotation enabled 7915 if "hgvs" in param: 7916 log.info(f"HGVS Annotation... ") 7917 for hgvs_option in param.get("hgvs", {}): 7918 log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}") 7919 else: 7920 return 7921 7922 # HGVS Param 7923 param_hgvs = param.get("hgvs", {}) 7924 use_exon = param_hgvs.get("use_exon", False) 7925 use_gene = param_hgvs.get("use_gene", False) 7926 use_protein = param_hgvs.get("use_protein", False) 7927 add_protein = param_hgvs.get("add_protein", False) 7928 full_format = param_hgvs.get("full_format", False) 7929 use_version = param_hgvs.get("use_version", False) 7930 codon_type = param_hgvs.get("codon_type", "3") 7931 7932 # refSseq refSeqLink 7933 databases_refseq = param_hgvs.get("refseq", databases_refseq) 7934 databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink) 7935 7936 # Assembly 7937 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 7938 7939 # Genome 7940 genome_file = None 7941 if find_genome(databases_genome): 7942 genome_file = find_genome(databases_genome) 7943 else: 7944 genome_file = find_genome( 7945 genome_path=databases_genomes_folders, assembly=assembly 7946 ) 7947 log.debug("Genome: " + str(genome_file)) 7948 7949 # refSseq 7950 refseq_file = find_file_prefix( 7951 input_file=databases_refseq, 7952 prefix="ncbiRefSeq", 7953 folder=databases_refseq_folders, 7954 assembly=assembly, 7955 ) 7956 log.debug("refSeq: " + str(refseq_file)) 7957 7958 # refSeqLink 7959 refseqlink_file = find_file_prefix( 7960 input_file=databases_refseqlink, 7961 prefix="ncbiRefSeqLink", 7962 folder=databases_refseq_folders, 7963 assembly=assembly, 7964 ) 7965 log.debug("refSeqLink: " + str(refseqlink_file)) 7966 7967 # Threads 7968 if not threads: 7969 threads = self.get_threads() 7970 log.debug("Threads: " + str(threads)) 7971 7972 # Variables 7973 table_variants = self.get_table_variants(clause="update") 7974 7975 # Get variants SNV and InDel only 7976 
query_variants = f""" 7977 SELECT "#CHROM" AS CHROM, POS, REF, ALT 7978 FROM {table_variants} 7979 WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$' 7980 """ 7981 df_variants = self.get_query_to_df(query_variants) 7982 7983 # Added columns 7984 added_columns = [] 7985 7986 # Add hgvs column in variants table 7987 hgvs_column_name = "hgvs_" + str(random.randrange(1000)) 7988 added_column = self.add_column( 7989 table_variants, hgvs_column_name, "STRING", default_value=None 7990 ) 7991 added_columns.append(added_column) 7992 7993 log.debug(f"refSeq loading...") 7994 # refSeq in duckDB 7995 refseq_table = get_refseq_table( 7996 conn=self.conn, refseq_table="refseq", refseq_file=refseq_file 7997 ) 7998 # Loading all refSeq in Dataframe 7999 refseq_query = f""" 8000 SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript 8001 FROM {refseq_table} 8002 JOIN df_variants ON ( 8003 {refseq_table}.chrom = df_variants.CHROM 8004 AND {refseq_table}.txStart<=df_variants.POS 8005 AND {refseq_table}.txEnd>=df_variants.POS 8006 ) 8007 """ 8008 refseq_df = self.conn.query(refseq_query).pl() 8009 8010 if refseqlink_file: 8011 log.debug(f"refSeqLink loading...") 8012 # refSeqLink in duckDB 8013 refseqlink_table = get_refseq_table( 8014 conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file 8015 ) 8016 # Loading all refSeqLink in Dataframe 8017 protacc_column = "protAcc_with_ver" 8018 mrnaacc_column = "mrnaAcc_with_ver" 8019 refseqlink_query = f""" 8020 SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript 8021 FROM {refseqlink_table} 8022 JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver) 8023 WHERE protAcc_without_ver IS NOT NULL 8024 """ 8025 # Polars Dataframe 8026 refseqlink_df = self.conn.query(f"{refseqlink_query}").pl() 8027 8028 # Read RefSeq transcripts into a python dict/model. 
8029 log.debug(f"Transcripts loading...") 8030 with tempfile.TemporaryDirectory() as tmpdir: 8031 transcripts_query = f""" 8032 COPY ( 8033 SELECT {refseq_table}.* 8034 FROM {refseq_table} 8035 JOIN df_variants ON ( 8036 {refseq_table}.chrom=df_variants.CHROM 8037 AND {refseq_table}.txStart<=df_variants.POS 8038 AND {refseq_table}.txEnd>=df_variants.POS 8039 ) 8040 ) 8041 TO '{tmpdir}/transcript.tsv' (DELIMITER '\t'); 8042 """ 8043 self.conn.query(transcripts_query) 8044 with open(f"{tmpdir}/transcript.tsv") as infile: 8045 transcripts = read_transcripts(infile) 8046 8047 # Polars connexion 8048 polars_conn = pl.SQLContext(register_globals=True, eager=True) 8049 8050 log.debug("Genome loading...") 8051 # Read genome sequence using pyfaidx. 8052 genome = Fasta(genome_file) 8053 8054 log.debug("Start annotation HGVS...") 8055 8056 # Create 8057 # a Dask Dataframe from Pandas dataframe with partition as number of threads 8058 ddf = dd.from_pandas(df_variants, npartitions=threads) 8059 8060 # Use dask.dataframe.apply() to apply function on each partition 8061 ddf[hgvs_column_name] = ddf.map_partitions(partition_function) 8062 8063 # Convert Dask DataFrame to Pandas Dataframe 8064 df = ddf.compute() 8065 8066 # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???) 
8067 with tempfile.TemporaryDirectory() as tmpdir: 8068 df_parquet = os.path.join(tmpdir, "df.parquet") 8069 df.to_parquet(df_parquet) 8070 8071 # Update hgvs column 8072 update_variant_query = f""" 8073 UPDATE {table_variants} 8074 SET "{hgvs_column_name}"=df."{hgvs_column_name}" 8075 FROM read_parquet('{df_parquet}') as df 8076 WHERE variants."#CHROM" = df.CHROM 8077 AND variants.POS = df.POS 8078 AND variants.REF = df.REF 8079 AND variants.ALT = df.ALT 8080 AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL 8081 """ 8082 self.execute_query(update_variant_query) 8083 8084 # Update INFO column 8085 sql_query_update = f""" 8086 UPDATE {table_variants} 8087 SET INFO = 8088 concat( 8089 CASE 8090 WHEN INFO NOT IN ('','.') 8091 THEN concat(INFO, ';') 8092 ELSE '' 8093 END, 8094 'hgvs=', 8095 {hgvs_column_name} 8096 ) 8097 WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL 8098 """ 8099 self.execute_query(sql_query_update) 8100 8101 # Add header 8102 HGVS_INFOS = { 8103 "hgvs": { 8104 "ID": "hgvs", 8105 "Number": ".", 8106 "Type": "String", 8107 "Description": f"HGVS annotatation with HOWARD", 8108 } 8109 } 8110 8111 for field in HGVS_INFOS: 8112 field_ID = HGVS_INFOS[field]["ID"] 8113 field_description = HGVS_INFOS[field]["Description"] 8114 self.get_header().infos[field_ID] = vcf.parser._Info( 8115 field_ID, 8116 HGVS_INFOS[field]["Number"], 8117 HGVS_INFOS[field]["Type"], 8118 field_description, 8119 "unknown", 8120 "unknown", 8121 code_type_map[HGVS_INFOS[field]["Type"]], 8122 ) 8123 8124 # Remove added columns 8125 for added_column in added_columns: 8126 self.drop_column(column=added_column) 8127 8128 ### 8129 # Calculation 8130 ### 8131 8132 def get_operations_help( 8133 self, operations_config_dict: dict = {}, operations_config_file: str = None 8134 ) -> list: 8135 8136 # Init 8137 operations_help = [] 8138 8139 # operations 8140 operations = self.get_config_json( 8141 name="calculations", 8142 
config_dict=operations_config_dict, 8143 config_file=operations_config_file, 8144 ) 8145 for op in operations: 8146 op_name = operations[op].get("name", op).upper() 8147 op_description = operations[op].get("description", op_name) 8148 op_available = operations[op].get("available", False) 8149 if op_available: 8150 operations_help.append(f" {op_name}: {op_description}") 8151 8152 # Sort operations 8153 operations_help.sort() 8154 8155 # insert header 8156 operations_help.insert(0, "Available calculation operations:") 8157 8158 # Return 8159 return operations_help 8160 8161 def calculation( 8162 self, 8163 operations: dict = {}, 8164 operations_config_dict: dict = {}, 8165 operations_config_file: str = None, 8166 ) -> None: 8167 """ 8168 It takes a list of operations, and for each operation, it checks if it's a python or sql 8169 operation, and then calls the appropriate function 8170 8171 param json example: 8172 "calculation": { 8173 "NOMEN": { 8174 "options": { 8175 "hgvs_field": "hgvs" 8176 }, 8177 "middle" : null 8178 } 8179 """ 8180 8181 # Param 8182 param = self.get_param() 8183 8184 # operations config 8185 operations_config = self.get_config_json( 8186 name="calculations", 8187 config_dict=operations_config_dict, 8188 config_file=operations_config_file, 8189 ) 8190 8191 # Upper keys 8192 operations_config = {k.upper(): v for k, v in operations_config.items()} 8193 8194 # Calculations 8195 8196 # Operations from param 8197 operations = param.get("calculation", {}).get("calculations", operations) 8198 8199 # Quick calculation - add 8200 if param.get("calculations", None): 8201 8202 # List of operations 8203 calculations_list = [ 8204 value.strip() for value in param.get("calculations", "").split(",") 8205 ] 8206 8207 # Log 8208 log.info(f"Quick Calculations:") 8209 for calculation_key in calculations_list: 8210 log.info(f" {calculation_key}") 8211 8212 # Create tmp operations (to keep operation order) 8213 operations_tmp = {} 8214 for calculation_operation in 
calculations_list: 8215 if calculation_operation.upper() not in operations_tmp: 8216 log.debug( 8217 f"{calculation_operation}.upper() not in {operations_tmp}" 8218 ) 8219 operations_tmp[calculation_operation.upper()] = {} 8220 add_value_into_dict( 8221 dict_tree=operations_tmp, 8222 sections=[ 8223 calculation_operation.upper(), 8224 ], 8225 value=operations.get(calculation_operation.upper(), {}), 8226 ) 8227 # Add operations already in param 8228 for calculation_operation in operations: 8229 if calculation_operation not in operations_tmp: 8230 operations_tmp[calculation_operation] = operations.get( 8231 calculation_operation, {} 8232 ) 8233 8234 # Update operations in param 8235 operations = operations_tmp 8236 8237 # Operations for calculation 8238 if not operations: 8239 operations = param.get("calculation", {}).get("calculations", {}) 8240 8241 if operations: 8242 log.info(f"Calculations...") 8243 8244 # For each operations 8245 for operation_name in operations: 8246 operation_name = operation_name.upper() 8247 if operation_name not in [""]: 8248 if operation_name in operations_config: 8249 log.info(f"Calculation '{operation_name}'") 8250 operation = operations_config[operation_name] 8251 operation_type = operation.get("type", "sql") 8252 if operation_type == "python": 8253 self.calculation_process_function( 8254 operation=operation, operation_name=operation_name 8255 ) 8256 elif operation_type == "sql": 8257 self.calculation_process_sql( 8258 operation=operation, operation_name=operation_name 8259 ) 8260 else: 8261 log.error( 8262 f"Operations config: Type '{operation_type}' NOT available" 8263 ) 8264 raise ValueError( 8265 f"Operations config: Type '{operation_type}' NOT available" 8266 ) 8267 else: 8268 log.error( 8269 f"Operations config: Calculation '{operation_name}' NOT available" 8270 ) 8271 raise ValueError( 8272 f"Operations config: Calculation '{operation_name}' NOT available" 8273 ) 8274 8275 # Explode INFOS fields into table fields 8276 if 
self.get_explode_infos(): 8277 self.explode_infos( 8278 prefix=self.get_explode_infos_prefix(), 8279 fields=self.get_explode_infos_fields(), 8280 force=True, 8281 ) 8282 8283 def calculation_process_sql( 8284 self, operation: dict, operation_name: str = "unknown" 8285 ) -> None: 8286 """ 8287 The `calculation_process_sql` function takes in a mathematical operation as a string and 8288 performs the operation, updating the specified table with the result. 8289 8290 :param operation: The `operation` parameter is a dictionary that contains information about the 8291 mathematical operation to be performed. It includes the following keys: 8292 :type operation: dict 8293 :param operation_name: The `operation_name` parameter is a string that represents the name of 8294 the mathematical operation being performed. It is used for logging and error handling purposes, 8295 defaults to unknown 8296 :type operation_name: str (optional) 8297 """ 8298 8299 # table variants 8300 table_variants = self.get_table_variants(clause="alter") 8301 8302 # Operation infos 8303 operation_name = operation.get("name", "unknown") 8304 log.debug(f"process sql {operation_name}") 8305 output_column_name = operation.get("output_column_name", operation_name) 8306 output_column_type = operation.get("output_column_type", "String") 8307 prefix = operation.get("explode_infos_prefix", "") 8308 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8309 output_column_description = operation.get( 8310 "output_column_description", f"{operation_name} operation" 8311 ) 8312 operation_query = operation.get("operation_query", None) 8313 if isinstance(operation_query, list): 8314 operation_query = " ".join(operation_query) 8315 operation_info_fields = operation.get("info_fields", []) 8316 operation_info_fields_check = operation.get("info_fields_check", False) 8317 operation_info = operation.get("operation_info", True) 8318 8319 if operation_query: 8320 8321 # Info fields check 8322 
operation_info_fields_check_result = True 8323 if operation_info_fields_check: 8324 header_infos = self.get_header().infos 8325 for info_field in operation_info_fields: 8326 operation_info_fields_check_result = ( 8327 operation_info_fields_check_result 8328 and info_field in header_infos 8329 ) 8330 8331 # If info fields available 8332 if operation_info_fields_check_result: 8333 8334 # Added_columns 8335 added_columns = [] 8336 8337 # Create VCF header field 8338 vcf_reader = self.get_header() 8339 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8340 output_column_name, 8341 ".", 8342 output_column_type, 8343 output_column_description, 8344 "howard calculation", 8345 "0", 8346 self.code_type_map.get(output_column_type), 8347 ) 8348 8349 # Explode infos if needed 8350 log.debug(f"calculation_process_sql prefix {prefix}") 8351 added_columns += self.explode_infos( 8352 prefix=prefix, 8353 fields=[output_column_name] + operation_info_fields, 8354 force=True, 8355 ) 8356 8357 # Create column 8358 added_column = self.add_column( 8359 table_name=table_variants, 8360 column_name=prefix + output_column_name, 8361 column_type=output_column_type_sql, 8362 default_value="null", 8363 ) 8364 added_columns.append(added_column) 8365 8366 # Operation calculation 8367 try: 8368 8369 # Query to update calculation column 8370 sql_update = f""" 8371 UPDATE {table_variants} 8372 SET "{prefix}{output_column_name}" = ({operation_query}) 8373 """ 8374 self.conn.execute(sql_update) 8375 8376 # Add to INFO 8377 if operation_info: 8378 sql_update_info = f""" 8379 UPDATE {table_variants} 8380 SET "INFO" = 8381 concat( 8382 CASE 8383 WHEN "INFO" IS NOT NULL 8384 THEN concat("INFO", ';') 8385 ELSE '' 8386 END, 8387 '{output_column_name}=', 8388 "{prefix}{output_column_name}" 8389 ) 8390 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8391 """ 8392 self.conn.execute(sql_update_info) 8393 8394 except: 8395 log.error( 8396 f"Operations 
config: Calculation '{operation_name}' query failed" 8397 ) 8398 raise ValueError( 8399 f"Operations config: Calculation '{operation_name}' query failed" 8400 ) 8401 8402 # Remove added columns 8403 for added_column in added_columns: 8404 log.debug(f"added_column: {added_column}") 8405 self.drop_column(column=added_column) 8406 8407 else: 8408 log.error( 8409 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8410 ) 8411 raise ValueError( 8412 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8413 ) 8414 8415 else: 8416 log.error( 8417 f"Operations config: Calculation '{operation_name}' query NOT defined" 8418 ) 8419 raise ValueError( 8420 f"Operations config: Calculation '{operation_name}' query NOT defined" 8421 ) 8422 8423 def calculation_process_function( 8424 self, operation: dict, operation_name: str = "unknown" 8425 ) -> None: 8426 """ 8427 The `calculation_process_function` takes in an operation dictionary and performs the specified 8428 function with the given parameters. 8429 8430 :param operation: The `operation` parameter is a dictionary that contains information about the 8431 operation to be performed. It has the following keys: 8432 :type operation: dict 8433 :param operation_name: The `operation_name` parameter is a string that represents the name of 8434 the operation being performed. 
It is used for logging purposes, defaults to unknown 8435 :type operation_name: str (optional) 8436 """ 8437 8438 operation_name = operation["name"] 8439 log.debug(f"process sql {operation_name}") 8440 function_name = operation["function_name"] 8441 function_params = operation["function_params"] 8442 getattr(self, function_name)(*function_params) 8443 8444 def calculation_variant_id(self) -> None: 8445 """ 8446 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8447 updates the INFO field of a variants table with the variant ID. 8448 """ 8449 8450 # variant_id annotation field 8451 variant_id_tag = self.get_variant_id_column() 8452 added_columns = [variant_id_tag] 8453 8454 # variant_id hgvs tags" 8455 vcf_infos_tags = { 8456 variant_id_tag: "howard variant ID annotation", 8457 } 8458 8459 # Variants table 8460 table_variants = self.get_table_variants() 8461 8462 # Header 8463 vcf_reader = self.get_header() 8464 8465 # Add variant_id to header 8466 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8467 variant_id_tag, 8468 ".", 8469 "String", 8470 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8471 "howard calculation", 8472 "0", 8473 self.code_type_map.get("String"), 8474 ) 8475 8476 # Update 8477 sql_update = f""" 8478 UPDATE {table_variants} 8479 SET "INFO" = 8480 concat( 8481 CASE 8482 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8483 THEN '' 8484 ELSE concat("INFO", ';') 8485 END, 8486 '{variant_id_tag}=', 8487 "{variant_id_tag}" 8488 ) 8489 """ 8490 self.conn.execute(sql_update) 8491 8492 # Remove added columns 8493 for added_column in added_columns: 8494 self.drop_column(column=added_column) 8495 8496 def calculation_extract_snpeff_hgvs( 8497 self, 8498 snpeff_hgvs: str = "snpeff_hgvs", 8499 snpeff_field: str = "ANN", 8500 ) -> None: 8501 """ 8502 The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff 8503 annotation field in a VCF file and adds them as a new 
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
        function is used to specify the name of the column that will store the HGVS nomenclatures
        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
        snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
        function represents the field in the VCF file that contains SnpEff annotations. This field is
        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
        to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: when the snpEff ANN header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty configured prefix is overwritten with "INFO/"
        # — looks intentional (explode_infos convention) but confirm
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (column names after explode)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (tracked for cleanup)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: field names are quoted in the header description,
            # separated by " | " (snpEff convention)
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters as the normalized key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id (join key for the UPDATE below)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe with variant id and the exploded ANN column
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column: extract HGVS strings from each ANN value
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append the extracted HGVS to INFO, joining the dataframe on variant id
            # (duckDB UPDATE ... FROM over the registered pandas dataframe)
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

                """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory eagerly)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
Please Anotate with snpEff before use this calculation option" 8627 ) 8628 8629 # Remove added columns 8630 for added_column in added_columns: 8631 self.drop_column(column=added_column) 8632 8633 def calculation_snpeff_ann_explode( 8634 self, 8635 uniquify: bool = True, 8636 output_format: str = "fields", 8637 output_prefix: str = "snpeff_", 8638 snpeff_field: str = "ANN", 8639 ) -> None: 8640 """ 8641 The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by 8642 exploding the HGVS field and updating variant information accordingly. 8643 8644 :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a 8645 boolean flag that determines whether the output should be uniquified or not. When set to `True`, 8646 it indicates that the output should be unique, meaning that duplicate entries should be removed, 8647 defaults to True 8648 :type uniquify: bool (optional) 8649 :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode` 8650 function specifies the format in which the output annotations will be generated. It has a 8651 default value of "fields". You can also set it to "JSON" to output the annotations in JSON 8652 format, defaults to fields 8653 :type output_format: str (optional) 8654 :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode` 8655 method is used to specify the prefix that will be added to the output annotations generated 8656 during the calculation process. This prefix helps to differentiate the newly added annotations 8657 from existing ones in the output data. By default, the, defaults to ANN_ 8658 :type output_prefix: str (optional) 8659 :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode` 8660 function is used to specify the field in the VCF file that contains SnpEff annotations. 
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Explode the snpEff annotation field into per-sub-field INFO tags (or
        one JSON INFO tag) and append them to the INFO column.

        The snpEff header description is parsed to recover the annotation
        sub-field names, each annotation value is exploded with
        `explode_snpeff_ann`, the corresponding INFO fields are declared in
        the header, and the result is appended to the INFO column. Helper
        columns are dropped at the end. A warning is logged (and nothing is
        done) when the snpEff field is absent from the header.

        :param uniquify: whether duplicate annotation values should be
            removed from the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param output_format: output format of the exploded annotations,
            either "fields" (one INFO tag per sub-field) or "JSON" (a single
            JSON INFO tag), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO tags (and
            used as tag name in JSON mode), defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field holding the snpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # Internal name of the exploded-annotations column
        snpeff_hgvs = "snpeff_ann_explode"

        # Header description for the generated tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of exploded INFO columns
        # NOTE(review): any non-empty prefix is forced to "INFO/" — confirm
        # that ignoring the configured prefix value is intended
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Exploded column names for the snpEff field and the generated field
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # Explode the snpEff annotation field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the snpEff
            # header description (e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Alphanumeric-only key mapped to the original sub-field name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant id column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variant id and exploded annotation values into a dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each annotation value into the configured output format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the generated fields in the header: one tag in JSON
            # mode, one tag per annotation sub-field otherwise
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO, joining on the variant
            # id (the dataframe is resolved by name through duckdb's
            # replacement scan)
            # NOTE(review): target table is hard-coded to 'variants' while the
            # WHERE clause uses {table_variants} — confirm they always match
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Drop the temporary columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
    def calculation_extract_nomen(self) -> None:
        """
        Compute the NOMEN hgvs nomenclature fields from an HGVS annotation
        field and append them to the INFO column of the variants table.

        The configured HGVS field is exploded into a column, each value is
        resolved with `find_nomen` (honouring an optional transcripts-of-
        preference file, transcripts column, pattern and source order), and
        every field of the NOMEN structure (NOMEN, CNOMEN, RNOMEN, ...) is
        declared in the header and appended to INFO. Temporary columns are
        dropped at the end.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the dataframe column holding the per-variant NOMEN dict
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: INFO tag -> header description
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Columns added by this calculation (dropped at the end)
        added_columns = []

        # HGVS source field (option 'hgvs_field', default 'hgvs')
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional NOMEN pattern (option 'pattern')
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Transcripts-of-preference sources, keyed by source name
        transcripts_sources = {}

        # Optional transcripts-of-preference file (option 'transcripts');
        # first column of the file is the transcript list
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Optional transcripts table (option 'transcripts_table',
        # defaults to the variants table)
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        # Optional transcripts column (option 'transcripts_column')
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        # SQL expression providing the per-variant transcript of preference
        if transcripts_table and transcripts_column:
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode the column if it does not already exist
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Order in which the transcript-of-preference sources are consulted
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Transcripts coming from the file source (empty list if none)
        transcripts = transcripts_sources.get("file", [])

        # Explode the HGVS field into a column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Available exploded columns; the calculation only runs when the
        # exploded HGVS column exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Load coordinates, hgvs and transcript of preference
            # NOTE(review): table is hard-coded to 'variants' here (and in the
            # UPDATE below) instead of using get_table_variants() — confirm
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Resolve the NOMEN dict for each row
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode the NOMEN structure into columns and build the SQL
            # fragments appended to INFO
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode this NOMEN field into its own column
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Only append ';<field>=<value>' when the value is non-empty
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                            ';{nomen_field}=',
                            dataframe_hgvs."{nomen_field}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append all NOMEN fields to INFO, joining on the variant
            # coordinates (the dataframe is resolved by name through duckdb's
            # replacement scan)
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe memory
            del dataframe_hgvs
            gc.collect()

        # Drop the temporary columns added for this calculation
        for added_column in added_columns:
            self.drop_column(column=added_column)
It is used to create the annotation field in the 9014 VCF header and to update the corresponding field in the variants table, defaults to 9015 findbypipeline 9016 :type tag: str (optional) 9017 """ 9018 9019 # if FORMAT and samples 9020 if ( 9021 "FORMAT" in self.get_header_columns_as_list() 9022 and self.get_header_sample_list() 9023 ): 9024 9025 # findbypipeline annotation field 9026 findbypipeline_tag = tag 9027 9028 # VCF infos tags 9029 vcf_infos_tags = { 9030 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9031 } 9032 9033 # Prefix 9034 prefix = self.get_explode_infos_prefix() 9035 9036 # Field 9037 findbypipeline_infos = prefix + findbypipeline_tag 9038 9039 # Variants table 9040 table_variants = self.get_table_variants() 9041 9042 # Header 9043 vcf_reader = self.get_header() 9044 9045 # Create variant id 9046 variant_id_column = self.get_variant_id_column() 9047 added_columns = [variant_id_column] 9048 9049 # variant_id, FORMAT and samples 9050 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9051 self.get_header_sample_list() 9052 ) 9053 9054 # Create dataframe 9055 dataframe_findbypipeline = self.get_query_to_df( 9056 f""" SELECT {samples_fields} FROM {table_variants} """ 9057 ) 9058 9059 # Create findbypipeline column 9060 dataframe_findbypipeline[findbypipeline_infos] = ( 9061 dataframe_findbypipeline.apply( 9062 lambda row: findbypipeline( 9063 row, samples=self.get_header_sample_list() 9064 ), 9065 axis=1, 9066 ) 9067 ) 9068 9069 # Add snpeff_hgvs to header 9070 vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9071 findbypipeline_tag, 9072 ".", 9073 "String", 9074 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9075 "howard calculation", 9076 "0", 9077 self.code_type_map.get("String"), 9078 ) 9079 9080 # Update 9081 sql_update = f""" 9082 UPDATE variants 9083 SET "INFO" = 9084 concat( 9085 CASE 9086 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9087 THEN '' 9088 ELSE 
concat("INFO", ';') 9089 END, 9090 CASE 9091 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9092 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9093 THEN concat( 9094 '{findbypipeline_tag}=', 9095 dataframe_findbypipeline."{findbypipeline_infos}" 9096 ) 9097 ELSE '' 9098 END 9099 ) 9100 FROM dataframe_findbypipeline 9101 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9102 """ 9103 self.conn.execute(sql_update) 9104 9105 # Remove added columns 9106 for added_column in added_columns: 9107 self.drop_column(column=added_column) 9108 9109 # Delete dataframe 9110 del dataframe_findbypipeline 9111 gc.collect() 9112 9113 def calculation_genotype_concordance(self) -> None: 9114 """ 9115 The function `calculation_genotype_concordance` calculates the genotype concordance for 9116 multi-caller VCF files and updates the variant information in the database. 9117 """ 9118 9119 # if FORMAT and samples 9120 if ( 9121 "FORMAT" in self.get_header_columns_as_list() 9122 and self.get_header_sample_list() 9123 ): 9124 9125 # genotypeconcordance annotation field 9126 genotypeconcordance_tag = "genotypeconcordance" 9127 9128 # VCF infos tags 9129 vcf_infos_tags = { 9130 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9131 } 9132 9133 # Prefix 9134 prefix = self.get_explode_infos_prefix() 9135 9136 # Field 9137 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9138 9139 # Variants table 9140 table_variants = self.get_table_variants() 9141 9142 # Header 9143 vcf_reader = self.get_header() 9144 9145 # Create variant id 9146 variant_id_column = self.get_variant_id_column() 9147 added_columns = [variant_id_column] 9148 9149 # variant_id, FORMAT and samples 9150 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9151 self.get_header_sample_list() 9152 ) 9153 9154 # Create dataframe 9155 dataframe_genotypeconcordance = self.get_query_to_df( 9156 f""" SELECT 
{samples_fields} FROM {table_variants} """ 9157 ) 9158 9159 # Create genotypeconcordance column 9160 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9161 dataframe_genotypeconcordance.apply( 9162 lambda row: genotypeconcordance( 9163 row, samples=self.get_header_sample_list() 9164 ), 9165 axis=1, 9166 ) 9167 ) 9168 9169 # Add genotypeconcordance to header 9170 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9171 genotypeconcordance_tag, 9172 ".", 9173 "String", 9174 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9175 "howard calculation", 9176 "0", 9177 self.code_type_map.get("String"), 9178 ) 9179 9180 # Update 9181 sql_update = f""" 9182 UPDATE variants 9183 SET "INFO" = 9184 concat( 9185 CASE 9186 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9187 THEN '' 9188 ELSE concat("INFO", ';') 9189 END, 9190 CASE 9191 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9192 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9193 THEN concat( 9194 '{genotypeconcordance_tag}=', 9195 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9196 ) 9197 ELSE '' 9198 END 9199 ) 9200 FROM dataframe_genotypeconcordance 9201 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9202 """ 9203 self.conn.execute(sql_update) 9204 9205 # Remove added columns 9206 for added_column in added_columns: 9207 self.drop_column(column=added_column) 9208 9209 # Delete dataframe 9210 del dataframe_genotypeconcordance 9211 gc.collect() 9212 9213 def calculation_barcode(self, tag: str = "barcode") -> None: 9214 """ 9215 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9216 updates the INFO field in the file with the calculated barcode values. 9217 9218 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9219 name that will be used for the barcode calculation in the VCF file. 
If no tag name is provided, 9220 the default tag name is set to "barcode", defaults to barcode 9221 :type tag: str (optional) 9222 """ 9223 9224 # if FORMAT and samples 9225 if ( 9226 "FORMAT" in self.get_header_columns_as_list() 9227 and self.get_header_sample_list() 9228 ): 9229 9230 # barcode annotation field 9231 if not tag: 9232 tag = "barcode" 9233 9234 # VCF infos tags 9235 vcf_infos_tags = { 9236 tag: "barcode calculation (VaRank)", 9237 } 9238 9239 # Prefix 9240 prefix = self.get_explode_infos_prefix() 9241 9242 # Field 9243 barcode_infos = prefix + tag 9244 9245 # Variants table 9246 table_variants = self.get_table_variants() 9247 9248 # Header 9249 vcf_reader = self.get_header() 9250 9251 # Create variant id 9252 variant_id_column = self.get_variant_id_column() 9253 added_columns = [variant_id_column] 9254 9255 # variant_id, FORMAT and samples 9256 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9257 self.get_header_sample_list() 9258 ) 9259 9260 # Create dataframe 9261 dataframe_barcode = self.get_query_to_df( 9262 f""" SELECT {samples_fields} FROM {table_variants} """ 9263 ) 9264 9265 # Create barcode column 9266 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9267 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9268 ) 9269 9270 # Add barcode to header 9271 vcf_reader.infos[tag] = vcf.parser._Info( 9272 tag, 9273 ".", 9274 "String", 9275 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9276 "howard calculation", 9277 "0", 9278 self.code_type_map.get("String"), 9279 ) 9280 9281 # Update 9282 sql_update = f""" 9283 UPDATE {table_variants} 9284 SET "INFO" = 9285 concat( 9286 CASE 9287 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9288 THEN '' 9289 ELSE concat("INFO", ';') 9290 END, 9291 CASE 9292 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9293 AND dataframe_barcode."{barcode_infos}" NOT NULL 9294 THEN concat( 9295 '{tag}=', 9296 dataframe_barcode."{barcode_infos}" 9297 ) 9298 ELSE '' 9299 
END 9300 ) 9301 FROM dataframe_barcode 9302 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9303 """ 9304 self.conn.execute(sql_update) 9305 9306 # Remove added columns 9307 for added_column in added_columns: 9308 self.drop_column(column=added_column) 9309 9310 # Delete dataframe 9311 del dataframe_barcode 9312 gc.collect() 9313 9314 def calculation_barcode_family(self, tag: str = "BCF") -> None: 9315 """ 9316 The `calculation_barcode_family` function calculates barcode values for variants in a VCF file 9317 and updates the INFO field in the file with the calculated barcode values. 9318 9319 :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify 9320 the barcode tag that will be added to the VCF file during the calculation process. If no value 9321 is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF 9322 :type tag: str (optional) 9323 """ 9324 9325 # if FORMAT and samples 9326 if ( 9327 "FORMAT" in self.get_header_columns_as_list() 9328 and self.get_header_sample_list() 9329 ): 9330 9331 # barcode annotation field 9332 if not tag: 9333 tag = "BCF" 9334 9335 # VCF infos tags 9336 vcf_infos_tags = { 9337 tag: "barcode family calculation", 9338 f"{tag}S": "barcode family samples", 9339 } 9340 9341 # Param 9342 param = self.get_param() 9343 log.debug(f"param={param}") 9344 9345 # Prefix 9346 prefix = self.get_explode_infos_prefix() 9347 9348 # PED param 9349 ped = ( 9350 param.get("calculation", {}) 9351 .get("calculations", {}) 9352 .get("BARCODEFAMILY", {}) 9353 .get("family_pedigree", None) 9354 ) 9355 log.debug(f"ped={ped}") 9356 9357 # Load PED 9358 if ped: 9359 9360 # Pedigree is a file 9361 if isinstance(ped, str) and os.path.exists(full_path(ped)): 9362 log.debug("Pedigree is file") 9363 with open(full_path(ped)) as ped: 9364 ped = json.load(ped) 9365 9366 # Pedigree is a string 9367 elif isinstance(ped, str): 9368 log.debug("Pedigree is str") 9369 
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Compute a family barcode per variant and append it (with the list of
        family samples) to every genotype column.

        The family pedigree is read from the 'BARCODEFAMILY' calculation
        options and may be a JSON file path, a JSON string, a comma-separated
        sample list, or a dict; when absent, all samples of the file are
        used. The barcode is computed per row over the pedigree samples
        (`barcode` helper), two FORMAT fields ('<tag>' and '<tag>S') are
        declared in the header, and ':<barcode>:<samples>' is appended to
        each sample genotype (and ':<tag>:<tag>S' to FORMAT). Nothing happens
        when the file has no FORMAT column or no samples.

        :param tag: name of the FORMAT field holding the family barcode;
            falls back to "BCF" when empty, defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or empty
        """

        # Only meaningful for files with genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name when an empty tag is given
            if not tag:
                tag = "BCF"

            # Header descriptions for the barcode and samples FORMAT fields
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Parameters
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree option (file path, JSON string, sample list or dict)
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file path: load it as JSON
                # (the 'ped' name is deliberately rebound from path to dict)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated list
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Comma-separated samples: each maps to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Samples involved in the family barcode
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample of the file
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Reject an empty pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved pedigree
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Dataframe column that will hold the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Load variant id, FORMAT and pedigree genotypes into a dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode row by row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the '<tag>' and '<tag>S' FORMAT fields in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET fragment per genotype column (plus FORMAT):
            # pedigree samples receive the barcode and the sample list,
            # FORMAT receives the field names, other samples receive '.'
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # './.' genotypes are padded with one '.' per FORMAT field
                # before appending, so the genotype stays aligned with FORMAT
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET fragments, joining on the variant id (the
            # dataframe is resolved by name through duckdb's replacement scan)
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_barcode
            gc.collect()
    def calculation_trio(self) -> None:
        """
        Compute the trio (father/mother/child) inheritance annotation of each
        variant and append it to the INFO column.

        The trio pedigree is read from the 'TRIO' calculation options and may
        be a JSON file path, a JSON string, a comma-separated
        'father,mother,child' list, or a dict; when absent, the first three
        samples of the file are used. The annotation is computed per row over
        the trio genotypes (`trio` helper), declared in the header, and
        written to INFO as 'trio=<value>'. Nothing happens when the file has
        no FORMAT column or no samples.

        :raises ValueError: if the trio pedigree is not well formatted or
            fewer than three samples are available
        """

        # Only meaningful for files with genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # trio annotation field
            trio_tag = "trio"

            # Header description for the tag
            vcf_infos_tags = {
                "trio": "trio calculation",
            }

            # Parameters
            param = self.get_param()

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Trio pedigree option (file path, JSON string, list or dict)
            trio_ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("TRIO", {})
                .get("trio_pedigree", None)
            )

            # Normalize the pedigree into {'father': ..., 'mother': ..., 'child': ...}
            if trio_ped:

                # Pedigree is a file path: load it as JSON
                # (the 'trio_ped' name is deliberately rebound from path to dict)
                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
                    log.debug("TRIO pedigree is file")
                    with open(full_path(trio_ped)) as trio_ped:
                        trio_ped = json.load(trio_ped)

                # Pedigree is a string: JSON first, else 'father,mother,child'
                elif isinstance(trio_ped, str):
                    log.debug("TRIO pedigree is str")
                    try:
                        trio_ped = json.loads(trio_ped)
                        log.debug("TRIO pedigree is json str")
                    except ValueError as e:
                        trio_samples = trio_ped.split(",")
                        if len(trio_samples) == 3:
                            trio_ped = {
                                "father": trio_samples[0],
                                "mother": trio_samples[1],
                                "child": trio_samples[2],
                            }
                            log.debug("TRIO pedigree is list str")
                        else:
                            msg_error = "TRIO pedigree not well formatted"
                            log.error(msg_error)
                            raise ValueError(msg_error)

                # Pedigree is already a dict
                elif isinstance(trio_ped, dict):
                    log.debug("TRIO pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "TRIO pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Ordered trio sample list: father, mother, child
                trio_samples = [
                    trio_ped.get("father", ""),
                    trio_ped.get("mother", ""),
                    trio_ped.get("child", ""),
                ]

            else:
                # No pedigree configured: use the first three samples
                log.debug("TRIO pedigree not defined. Take the first 3 samples")
                samples_list = self.get_header_sample_list()
                if len(samples_list) >= 3:
                    trio_samples = self.get_header_sample_list()[0:3]
                    trio_ped = {
                        "father": trio_samples[0],
                        "mother": trio_samples[1],
                        "child": trio_samples[2],
                    }
                else:
                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
                    log.error(msg_error)
                    raise ValueError(msg_error)

            # Reject a pedigree that does not define exactly three members
            if not trio_ped or len(trio_ped) != 3:
                msg_error = f"Error in TRIO pedigree: {trio_ped}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved pedigree
            log.info(
                f"Calculation 'TRIO' - Samples: "
                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
            )

            # Dataframe column that will hold the computed annotation
            trio_infos = prefix + trio_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant id column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load variant id, FORMAT and sample genotypes into a dataframe
            dataframe_trio = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the trio annotation row by row
            dataframe_trio[trio_infos] = dataframe_trio.apply(
                lambda row: trio(row, samples=trio_samples), axis=1
            )

            # Declare the trio tag in the header
            # NOTE(review): fallback description "snpEff hgvs annotations"
            # looks copy-pasted from the snpEff calculation; harmless since
            # the key is always present in vcf_infos_tags
            vcf_reader.infos[trio_tag] = vcf.parser._Info(
                trio_tag,
                ".",
                "String",
                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append 'trio=<value>' to INFO, joining on the variant id (the
            # dataframe is resolved by name through duckdb's replacement scan)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
                            AND dataframe_trio."{trio_infos}" NOT NULL
                            THEN concat(
                                '{trio_tag}=',
                                dataframe_trio."{trio_infos}"
                            )
                            ELSE ''
                        END
                    )
                FROM dataframe_trio
                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the temporary columns added for this calculation
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_trio
            gc.collect()
FROM {table_variants} """ 9729 log.debug(f"query={query}") 9730 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9731 9732 vaf_normalization_set = [] 9733 9734 # for each sample vaf_normalization 9735 for sample in self.get_header_sample_list(): 9736 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9737 lambda row: vaf_normalization(row, sample=sample), axis=1 9738 ) 9739 vaf_normalization_set.append( 9740 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9741 ) 9742 9743 # Add VAF to FORMAT 9744 dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[ 9745 "FORMAT" 9746 ].apply(lambda x: str(x) + ":VAF") 9747 vaf_normalization_set.append( 9748 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9749 ) 9750 9751 # Add vaf_normalization to header 9752 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9753 id=vaf_normalization_tag, 9754 num="1", 9755 type="Float", 9756 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9757 type_code=self.code_type_map.get("Float"), 9758 ) 9759 9760 # Create fields to add in INFO 9761 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9762 9763 # Update 9764 sql_update = f""" 9765 UPDATE {table_variants} 9766 SET {sql_vaf_normalization_set} 9767 FROM dataframe_vaf_normalization 9768 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9769 9770 """ 9771 self.conn.execute(sql_update) 9772 9773 # Remove added columns 9774 for added_column in added_columns: 9775 self.drop_column(column=added_column) 9776 9777 # Delete dataframe 9778 del dataframe_vaf_normalization 9779 gc.collect() 9780 9781 def calculation_genotype_stats(self, info: str = "VAF") -> None: 9782 """ 9783 The `calculation_genotype_stats` function calculates genotype statistics for a given information 9784 field in a VCF file and updates the INFO column of the variants table with the calculated 9785 statistics. 

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate the VCF info tags for the
        statistics: number of occurrences, list of values, minimum, maximum, mean, median
        ('mediane' tag) and standard deviation. Defaults to "VAF"
        :type info: str (optional)
        """

        # Only applicable if the file has genotypes (FORMAT column and samples)
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Stats annotation field (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: one INFO field per statistic, keyed by tag name
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Working column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column used to join the dataframe back to the table
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe of genotypes
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the stats dict for each variant across all samples
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic tag
            sql_vaf_stats_fields = []

            # For each statistic: extract its value and register the INFO tag
            for stat in vcf_infos_tags:

                # Extract this statistic from the per-variant stats dict
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Register the statistic tag in the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # Separator: first fragment has no leading ';'
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Fragment appending '<tag>=<value>' to INFO when value is present
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                                '{sep}{stat}=',
                                dataframe_vaf_stats."{stat}"
                            )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Append the statistics to the INFO column ('' and '.' treated as empty)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the helper variant id column added for the join
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe explicitly (it can be large)
            del dataframe_vaf_stats
            gc.collect()

    def calculation_transcripts_annotation(
        self, info_json: str = None, info_format: str = None
    ) -> None:
        """
        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
        field to it if transcripts are available.
9925 9926 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9927 is a string parameter that represents the information field to be used in the transcripts JSON. 9928 It is used to specify the JSON format for the transcripts information. If no value is provided 9929 when calling the method, it defaults to " 9930 :type info_json: str 9931 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9932 method is a string parameter that specifies the format of the information field to be used in 9933 the transcripts JSON. It is used to define the format of the information field 9934 :type info_format: str 9935 """ 9936 9937 # Create transcripts table 9938 transcripts_table = self.create_transcript_view() 9939 9940 # Add info field 9941 if transcripts_table: 9942 self.transcript_view_to_variants( 9943 transcripts_table=transcripts_table, 9944 transcripts_info_field_json=info_json, 9945 transcripts_info_field_format=info_format, 9946 ) 9947 else: 9948 log.info("No Transcripts to process. Check param.json file configuration") 9949 9950 def calculation_transcripts_prioritization(self) -> None: 9951 """ 9952 The function `calculation_transcripts_prioritization` creates a transcripts table and 9953 prioritizes transcripts based on certain criteria. 9954 """ 9955 9956 # Create transcripts table 9957 transcripts_table = self.create_transcript_view() 9958 9959 # Add info field 9960 if transcripts_table: 9961 self.transcripts_prioritization(transcripts_table=transcripts_table) 9962 else: 9963 log.info("No Transcripts to process. Check param.json file configuration") 9964 9965 def calculation_transcripts_export(self) -> None: 9966 """ """ 9967 9968 # Create transcripts table 9969 transcripts_table = self.create_transcript_view() 9970 9971 # Add info field 9972 if transcripts_table: 9973 self.transcripts_export(transcripts_table=transcripts_table) 9974 else: 9975 log.info("No Transcripts to process. 
Check param.json file configuration") 9976 9977 ############### 9978 # Transcripts # 9979 ############### 9980 9981 def transcripts_export( 9982 self, transcripts_table: str = None, param: dict = {} 9983 ) -> bool: 9984 """ """ 9985 9986 log.debug("Start transcripts export...") 9987 9988 # Param 9989 if not param: 9990 param = self.get_param() 9991 9992 # Param export 9993 param_transcript_export = param.get("transcripts", {}).get("export", {}) 9994 9995 # Output file 9996 transcripts_export_output = param_transcript_export.get("output", None) 9997 9998 if not param_transcript_export or not transcripts_export_output: 9999 log.warning(f"No transcriipts export parameters defined!") 10000 return False 10001 10002 # List of transcripts annotations 10003 query_describe = f""" 10004 SELECT column_name 10005 FROM ( 10006 DESCRIBE SELECT * FROM {transcripts_table} 10007 ) 10008 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10009 """ 10010 transcripts_annotations_list = list( 10011 self.get_query_to_df(query=query_describe)["column_name"] 10012 ) 10013 10014 # Create transcripts table for export 10015 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10016 random.choices(string.ascii_uppercase + string.digits, k=10) 10017 ) 10018 query_create_transcripts_table_export = f""" 10019 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10020 """ 10021 self.execute_query(query=query_create_transcripts_table_export) 10022 10023 # Output file format 10024 transcripts_export_output_format = get_file_format( 10025 filename=transcripts_export_output 10026 ) 10027 10028 # Format VCF - construct INFO 10029 if transcripts_export_output_format in ["vcf"]: 10030 10031 # Construct query update INFO and header 10032 query_update_info = [] 10033 for field in transcripts_annotations_list: 10034 10035 # If field not in header 10036 if field not in 
                if field not in self.get_header_infos_list():

                    # Declare the annotation in the header (generic String INFO)
                    self.get_header().infos[field] = vcf.parser._Info(
                        field,
                        ".",
                        "String",
                        f"Annotation '{field}' from transcript view",
                        "unknown",
                        "unknown",
                        0,
                    )

                # Add field as INFO/tag ('<field>=<value>;' when value is present)
                query_update_info.append(
                    f"""
                    CASE
                        WHEN "{field}" IS NOT NULL
                        THEN concat('{field}=', "{field}", ';')
                        ELSE ''
                    END
                    """
                )

            # Query param: INFO content and VCF-shaped columns
            query_update_info_value = (
                f""" concat('', {", ".join(query_update_info)}) """
            )
            query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """

        else:

            # Query param: no INFO, annotations exported as plain columns
            query_update_info_value = f""" NULL """
            query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """

        # Update query INFO column
        query_update = f"""
            UPDATE {transcripts_table_export}
            SET INFO = {query_update_info_value}

        """
        self.execute_query(query=query_update)

        # Export to the configured output file
        self.export_output(
            output_file=transcripts_export_output,
            query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """,
        )

        # Drop transcripts export table
        query_drop_transcripts_table_export = f"""
            DROP TABLE {transcripts_table_export}
        """
        self.execute_query(query=query_drop_transcripts_table_export)

    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
        of the table containing transcripts data. If no value is provided, the view is created via
        `create_transcript_view` (table "transcripts"). This parameter is used to identify the table
        where the transcripts data is stored for the prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter is a dictionary that contains the configuration for the
        prioritization process of transcripts (prefix for prioritization fields, profiles, transcript
        preference file, ordering). When empty, `self.get_param()` is used
        :type param: dict
        :return: `True` if the transcripts prioritization process is successfully completed, `False`
        if no profile is defined or the prioritization did not run
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table: create the view when not provided
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and
        # Prioritization parameters (PZ Score and Flag)
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: mapping exploded column name -> INFO tag name
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PZTTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory prioritization fields, prefixed with pzprefix
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map prefixed->prefixed,
        # extra fields map unprefixed->prefixed and are declared in the header
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Declare the prefixed annotation in the header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # Force the prioritization to use the mandatory fields
        # (pz_param is the same dict object returned by param.get, so the
        # self.prioritization call below sees this change)
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query fragments
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        # Fragments appending ';<tag>=<value>' to INFO when value is present
        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by: configured order, or Flag ASC then Score DESC by default
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode into columns before ranking
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check that each field to explode exists in header or transcripts table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file (ranked list of preferred transcripts)
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced (preference order wins over scores)
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced (match transcript including its version)
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part drops the '.version' suffix when version is not forced)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update: rank transcripts per variant,
            # joined with the preference order
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update: rank transcripts per variant
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table:
        # append the best-ranked (rn = 1) transcript and its PZ fields to INFO
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" =
RankedTranscripts."ALT" 10395 """ 10396 10397 # log.debug(f"query_update={query_update}") 10398 self.execute_query(query=query_update) 10399 10400 # Return 10401 return True 10402 10403 def create_transcript_view_from_columns_map( 10404 self, 10405 transcripts_table: str = "transcripts", 10406 columns_maps: dict = {}, 10407 added_columns: list = [], 10408 temporary_tables: list = None, 10409 annotation_fields: list = None, 10410 column_rename: dict = {}, 10411 column_clean: bool = False, 10412 column_case: str = None, 10413 ) -> tuple[list, list, list]: 10414 """ 10415 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10416 specified columns mapping for transcripts data. 10417 10418 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10419 of the table where the transcripts data is stored or will be stored in the database. This table 10420 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10421 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10422 :type transcripts_table: str (optional) 10423 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10424 about how to map columns from a transcripts table to create a view. Each entry in the 10425 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10426 typically includes details such as the main transcript column and additional information columns 10427 :type columns_maps: dict 10428 :param added_columns: The `added_columns` parameter in the 10429 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10430 that will be added to the view being created based on the columns map provided. 
These columns 10431 are generated by exploding the transcript information columns along with the main transcript 10432 column 10433 :type added_columns: list 10434 :param temporary_tables: The `temporary_tables` parameter in the 10435 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10436 tables created during the process of creating a transcript view from a columns map. These 10437 temporary tables are used to store intermediate results or transformations before the final view 10438 is generated 10439 :type temporary_tables: list 10440 :param annotation_fields: The `annotation_fields` parameter in the 10441 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10442 used for annotation in the query view creation process. These fields are extracted from the 10443 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10444 :type annotation_fields: list 10445 :param column_rename: The `column_rename` parameter in the 10446 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10447 custom renaming for columns during the creation of the temporary table view. This parameter 10448 provides a mapping of original column names to the desired renamed column names. By using this 10449 parameter, 10450 :type column_rename: dict 10451 :param column_clean: The `column_clean` parameter in the 10452 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10453 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10454 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10455 False 10456 :type column_clean: bool (optional) 10457 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10458 function is used to specify the case transformation to be applied to the columns during the view 10459 creation process. It allows you to control whether the column values should be converted to 10460 lowercase, uppercase, or remain unchanged 10461 :type column_case: str 10462 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10463 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10464 """ 10465 10466 log.debug("Start transcrpts view creation from columns map...") 10467 10468 # "from_columns_map": [ 10469 # { 10470 # "transcripts_column": "Ensembl_transcriptid", 10471 # "transcripts_infos_columns": [ 10472 # "genename", 10473 # "Ensembl_geneid", 10474 # "LIST_S2_score", 10475 # "LIST_S2_pred", 10476 # ], 10477 # }, 10478 # { 10479 # "transcripts_column": "Ensembl_transcriptid", 10480 # "transcripts_infos_columns": [ 10481 # "genename", 10482 # "VARITY_R_score", 10483 # "Aloft_pred", 10484 # ], 10485 # }, 10486 # ], 10487 10488 # Init 10489 if temporary_tables is None: 10490 temporary_tables = [] 10491 if annotation_fields is None: 10492 annotation_fields = [] 10493 10494 # Variants table 10495 table_variants = self.get_table_variants() 10496 10497 for columns_map in columns_maps: 10498 10499 # Transcript column 10500 transcripts_column = columns_map.get("transcripts_column", None) 10501 10502 # Transcripts infos columns 10503 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10504 10505 # Transcripts infos columns rename 10506 column_rename = columns_map.get("column_rename", column_rename) 10507 10508 # Transcripts infos columns clean 10509 column_clean = columns_map.get("column_clean", column_clean) 10510 10511 # Transcripts infos columns case 10512 column_case = 
columns_map.get("column_case", column_case) 10513 10514 if transcripts_column is not None: 10515 10516 # Explode 10517 added_columns += self.explode_infos( 10518 fields=[transcripts_column] + transcripts_infos_columns 10519 ) 10520 10521 # View clauses 10522 clause_select_variants = [] 10523 clause_select_tanscripts = [] 10524 for field in [transcripts_column] + transcripts_infos_columns: 10525 10526 # AS field 10527 as_field = field 10528 10529 # Rename 10530 if column_rename: 10531 as_field = column_rename.get(as_field, as_field) 10532 10533 # Clean 10534 if column_clean: 10535 as_field = clean_annotation_field(as_field) 10536 10537 # Case 10538 if column_case: 10539 if column_case.lower() in ["lower"]: 10540 as_field = as_field.lower() 10541 elif column_case.lower() in ["upper"]: 10542 as_field = as_field.upper() 10543 10544 # Clause select Variants 10545 clause_select_variants.append( 10546 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10547 ) 10548 10549 if field in [transcripts_column]: 10550 clause_select_tanscripts.append( 10551 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10552 ) 10553 else: 10554 clause_select_tanscripts.append( 10555 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10556 ) 10557 annotation_fields.append(as_field) 10558 10559 # Querey View 10560 query = f""" 10561 SELECT 10562 "#CHROM", POS, REF, ALT, INFO, 10563 "{transcripts_column}" AS 'transcript', 10564 {", ".join(clause_select_tanscripts)} 10565 FROM ( 10566 SELECT 10567 "#CHROM", POS, REF, ALT, INFO, 10568 {", ".join(clause_select_variants)} 10569 FROM {table_variants} 10570 ) 10571 WHERE "{transcripts_column}" IS NOT NULL 10572 """ 10573 10574 # Create temporary table 10575 temporary_table = transcripts_table + "".join( 10576 random.choices(string.ascii_uppercase + string.digits, k=10) 10577 ) 10578 10579 # Temporary_tables 10580 temporary_tables.append(temporary_table) 10581 query_view = f""" 10582 CREATE TEMPORARY TABLE 
{temporary_table} 10583 AS ({query}) 10584 """ 10585 self.execute_query(query=query_view) 10586 10587 return added_columns, temporary_tables, annotation_fields 10588 10589 def create_transcript_view_from_column_format( 10590 self, 10591 transcripts_table: str = "transcripts", 10592 column_formats: dict = {}, 10593 temporary_tables: list = None, 10594 annotation_fields: list = None, 10595 column_rename: dict = {}, 10596 column_clean: bool = False, 10597 column_case: str = None, 10598 ) -> tuple[list, list, list]: 10599 """ 10600 The `create_transcript_view_from_column_format` function generates a transcript view based on 10601 specified column formats, adds additional columns and annotation fields, and returns the list of 10602 temporary tables and annotation fields. 10603 10604 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10605 of the table containing the transcripts data. This table will be used as the base table for 10606 creating the transcript view. The default value for this parameter is "transcripts", but you can 10607 provide a different table name if needed, defaults to transcripts 10608 :type transcripts_table: str (optional) 10609 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10610 about the columns to be used for creating the transcript view. Each entry in the dictionary 10611 specifies the mapping between a transcripts column and a transcripts infos column. This 10612 parameter allows you to define how the columns from the transcripts table should be transformed 10613 or mapped 10614 :type column_formats: dict 10615 :param temporary_tables: The `temporary_tables` parameter in the 10616 `create_transcript_view_from_column_format` function is a list that stores the names of 10617 temporary views created during the process of creating a transcript view from a column format. 
10618 These temporary views are used to manipulate and extract data before generating the final 10619 transcript view 10620 :type temporary_tables: list 10621 :param annotation_fields: The `annotation_fields` parameter in the 10622 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10623 that are extracted from the temporary views created during the process. These annotation fields 10624 are obtained by querying the temporary views and extracting the column names excluding specific 10625 columns like `#CH 10626 :type annotation_fields: list 10627 :param column_rename: The `column_rename` parameter in the 10628 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10629 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10630 column names to new column names in this dictionary, you can rename specific columns during the 10631 process 10632 :type column_rename: dict 10633 :param column_clean: The `column_clean` parameter in the 10634 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10635 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10636 will be cleaned during the creation of the transcript view based on the specified column format, 10637 defaults to False 10638 :type column_clean: bool (optional) 10639 :param column_case: The `column_case` parameter in the 10640 `create_transcript_view_from_column_format` function is used to specify the case transformation 10641 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10642 to convert the column names to uppercase or lowercase, respectively 10643 :type column_case: str 10644 :return: The `create_transcript_view_from_column_format` function returns two lists: 10645 `temporary_tables` and `annotation_fields`. 
10646 """ 10647 10648 log.debug("Start transcrpts view creation from column format...") 10649 10650 # "from_column_format": [ 10651 # { 10652 # "transcripts_column": "ANN", 10653 # "transcripts_infos_column": "Feature_ID", 10654 # } 10655 # ], 10656 10657 # Init 10658 if temporary_tables is None: 10659 temporary_tables = [] 10660 if annotation_fields is None: 10661 annotation_fields = [] 10662 10663 for column_format in column_formats: 10664 10665 # annotation field and transcript annotation field 10666 annotation_field = column_format.get("transcripts_column", "ANN") 10667 transcript_annotation = column_format.get( 10668 "transcripts_infos_column", "Feature_ID" 10669 ) 10670 10671 # Transcripts infos columns rename 10672 column_rename = column_format.get("column_rename", column_rename) 10673 10674 # Transcripts infos columns clean 10675 column_clean = column_format.get("column_clean", column_clean) 10676 10677 # Transcripts infos columns case 10678 column_case = column_format.get("column_case", column_case) 10679 10680 # Temporary View name 10681 temporary_view_name = transcripts_table + "".join( 10682 random.choices(string.ascii_uppercase + string.digits, k=10) 10683 ) 10684 10685 # Create temporary view name 10686 temporary_view_name = self.annotation_format_to_table( 10687 uniquify=True, 10688 annotation_field=annotation_field, 10689 view_name=temporary_view_name, 10690 annotation_id=transcript_annotation, 10691 column_rename=column_rename, 10692 column_clean=column_clean, 10693 column_case=column_case, 10694 ) 10695 10696 # Annotation fields 10697 if temporary_view_name: 10698 query_annotation_fields = f""" 10699 SELECT * 10700 FROM ( 10701 DESCRIBE SELECT * 10702 FROM {temporary_view_name} 10703 ) 10704 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10705 """ 10706 df_annotation_fields = self.get_query_to_df( 10707 query=query_annotation_fields 10708 ) 10709 10710 # Add temporary view and annotation fields 10711 
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Build the transcripts table by exploding transcript annotations from
        the variants table and merging them per transcript.

        The "transcripts" section of `param` drives the process: its "struct"
        key describes the sources ("from_columns_map" and
        "from_column_format"), and optional keys control transcript version
        stripping ("transcript_id_remove_version"), ID aliasing
        ("transcript_id_mapping_file") and filtering to mapped IDs only
        ("transcript_id_mapping_force").

        :param transcripts_table: name of the table to create; taken from
            param["transcripts"]["table"] (default "transcripts") when None
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: drop an existing table of that name
            before creating it, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: parameters dictionary; falls back to `self.get_param()`
            when empty
        :type param: dict
        :return: the name of the created transcripts table, or None when no
            "struct" section is configured
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is configured
        transcripts_table_default = "transcripts"

        # Param — fall back to the object's parameters
        if not param:
            param = self.get_param()

        # Structure describing the annotation sources
        struct = param.get("transcripts", {}).get("struct", None)

        # Strip transcript version suffix (e.g. ".2") when grouping
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Optional transcript ID alias mapping file
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # If set, keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name from param when not given
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table (dropped again at the end)
            added_columns = []

            # Temporary per-source tables to merge
            temporary_tables = []

            # Annotation column names gathered from the sources
            annotation_fields = []

            # Source 1: explicit columns map
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Source 2: structured annotation column (e.g. snpEff ANN)
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Deduplicate and remove key/reserved columns from annotations
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge all temporary tables with UNION BY NAME
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary tables (using UNION BY NAME)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases used for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating annotations per transcript
            query_merge_on_transcripts_annotation_fields = []

            # Aggregated list of distinct transcript IDs per variant
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate every annotation field the same way
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Branch 1: transcript ID mapping file provided
            if transcript_id_mapping_file:

                # DataFrame registered by name for DuckDB replacement scan:
                # the SQL below references it by this Python variable's name
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Join variants' transcript IDs against the alias column,
                # optionally stripping version suffixes first
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group key after mapping: prefer the mapped ID, fall back to
                # the original one (both without version suffix)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # First merge: aggregate annotations per (original, mapped) ID
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve the columns produced by the merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Second-level aggregation clauses (re-aggregate after mapping)
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # NOTE(review): "transcript_original" is exported as
                        # "transcripts_mapped" — possibly a typo for
                        # "transcript_mapped"/"transcript_original"; confirm
                        # against downstream consumers before changing
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Second merge: group by the mapped transcript ID
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts listed in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # Branch 2: no transcript mapping
            else:

                # Group key: transcript ID with or without version suffix
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Single aggregation pass, transcript_mapped always NULL
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop the transcripts table if requested
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Create the transcripts table from the merged query
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Clean up the columns added to the variants table
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # Nothing to build without a "struct" configuration
            transcripts_table = None

        return transcripts_table
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Explode a structured annotation INFO field (e.g. snpEff 'ANN') into a
        temporary table with one column per annotation sub-field and one row
        per transcript.

        The sub-field names are parsed from the quoted part of the field's
        header description (the "'A | B | C'" convention). Each annotation is
        converted to JSON per variant, then the JSON keys are turned into
        typed columns.

        :param uniquify: ensure unique values when exploding the annotation,
            defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: INFO field holding the annotation, defaults
            to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: sub-field used as the transcript identifier,
            defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :param column_rename: mapping of original to renamed column names
        :type column_rename: dict
        :param column_clean: clean column names (strip non-alphanumerics),
            defaults to False
        :type column_clean: bool (optional)
        :param column_case: "lower" or "upper" case transformation for
            column names
        :type column_case: str
        :return: the created table name, or None when `annotation_field` is
            not present in the VCF header
        """

        # Name of the intermediate JSON column built on the DataFrame
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript ID sub-field name so it
        # matches the transformed column names below
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy prefix is forced to "INFO/" regardless of
        # its actual value — confirm this is intended rather than using the
        # returned prefix itself
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the exploded annotation and its JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides the annotation field description)
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again at the end)
        added_columns = []

        # Explode the annotation INFO field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description, e.g. "... 'Allele | Annotation | ...'"
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                # Map alphanumeric-only names back to the original names
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # DataFrame of variants with the exploded annotation column;
            # referenced by name in the DuckDB queries below
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Convert each annotation string to JSON keyed by sub-field name.
            # NOTE(review): the column is created as `annotation_format_infos`
            # (prefix included) but queried below as bare `annotation_format`
            # — verify behavior when the prefix is non-empty
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Collect the JSON keys present in the first annotation entry
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT clause per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Original key, and its transformed (output) name
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Sample the key's values to detect its column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (""/None -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed, NULL-for-empty column extraction clause
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing the transcript ID
            # sub-field as a 'transcript' column
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field absent from the header: nothing created
            view_name = None

        # Clean up the columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
    def transcript_view_to_variants(
        self,
        transcripts_table: str = None,
        transcripts_column_id: str = None,
        transcripts_info_json: str = None,
        transcripts_info_field_json: str = None,
        transcripts_info_format: str = None,
        transcripts_info_field_format: str = None,
        param: dict = {},
    ) -> bool:
        """
        Write the content of the transcripts table back into the variants
        table, either as a JSON column/INFO field or as a pipe-separated
        "structured" column/INFO field.

        Any parameter left at None is resolved from the "transcripts"
        section of `param` (itself defaulting to `self.get_param()`).

        :param transcripts_table: name of the transcripts table, defaults to
            param["transcripts"]["table"] or "transcripts"
        :type transcripts_table: str
        :param transcripts_column_id: transcript ID column of the transcripts
            table, defaults to param["transcripts"]["column_id"] or
            "transcript"
        :type transcripts_column_id: str
        :param transcripts_info_json: name of the JSON column to create on
            the variants table
        :type transcripts_info_json: str
        :param transcripts_info_field_json: name of the INFO field to append
            with the JSON payload
        :type transcripts_info_field_json: str
        :param transcripts_info_format: name of the structured-format column
            to create on the variants table
        :type transcripts_info_format: str
        :param transcripts_info_field_format: name of the INFO field to
            append with the structured payload
        :type transcripts_info_field_format: str
        :param param: parameters dictionary; falls back to `self.get_param()`
            when empty
        :type param: dict
        :return: True on completion, False when none of the four output
            targets is configured
        """

        msg_info_prefix = "Start transcripts view to variants annotations"

        log.debug(f"{msg_info_prefix}...")

        # Defaults for parameter resolution
        transcripts_table_default = "transcripts"
        transcripts_column_id_default = "transcript"
        transcripts_info_json_default = None
        transcripts_info_format_default = None
        transcripts_info_field_json_default = None
        transcripts_info_field_format_default = None

        # Param — fall back to the object's parameters
        if not param:
            param = self.get_param()

        # Transcripts table
        if transcripts_table is None:
            transcripts_table = param.get("transcripts", {}).get(
                "table", transcripts_table_default
            )

        # Transcripts column ID
        if transcripts_column_id is None:
            transcripts_column_id = param.get("transcripts", {}).get(
                "column_id", transcripts_column_id_default
            )

        # Transcripts info json
        if transcripts_info_json is None:
            transcripts_info_json = param.get("transcripts", {}).get(
                "transcripts_info_json", transcripts_info_json_default
            )

        # Transcripts info field JSON
        if transcripts_info_field_json is None:
            transcripts_info_field_json = param.get("transcripts", {}).get(
                "transcripts_info_field_json", transcripts_info_field_json_default
            )
        # NOTE(review): a guard deriving transcripts_info_json from
        # transcripts_info_field_json was disabled here. Without it, setting
        # only transcripts_info_field_json makes the JSON update query below
        # interpolate `t.None` — confirm whether this combination is ever
        # used and reinstate the guard if so.

        # Transcripts info format
        if transcripts_info_format is None:
            transcripts_info_format = param.get("transcripts", {}).get(
                "transcripts_info_format", transcripts_info_format_default
            )

        # Transcripts info field FORMAT
        if transcripts_info_field_format is None:
            transcripts_info_field_format = param.get("transcripts", {}).get(
                "transcripts_info_field_format", transcripts_info_field_format_default
            )
        # NOTE(review): the analogous disabled guard for the FORMAT pair is
        # partly compensated below by the internal fallback name
        # "transcripts_info_format".

        # Variants table
        table_variants = self.get_table_variants()

        # Nothing to do when no output target is configured
        if (
            transcripts_info_json is None
            and transcripts_info_field_json is None
            and transcripts_info_format is None
            and transcripts_info_field_format is None
        ):
            return False

        # Annotation columns of the transcripts table (everything but the
        # variant key and the transcript ID)
        query_transcripts_infos_columns = f"""
            SELECT *
            FROM (
                DESCRIBE SELECT * FROM {transcripts_table}
            )
            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
        """
        transcripts_infos_columns = list(
            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
        )

        # SELECT / JSON-struct / pipe-format clauses, one per column
        clause_select = []
        clause_to_json = []
        clause_to_format = []
        for field in transcripts_infos_columns:
            # Do not consider INFO field for export into fields
            if field not in ["INFO"]:
                clause_select.append(
                    f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """
                )
                clause_to_json.append(f""" '{field}': "{field}" """)
                clause_to_format.append(f""" "{field}" """)

        # SET clauses for the two UPDATE statements
        update_set_json = []
        update_set_format = []

        # VCF header (new INFO fields are registered on it)
        vcf_reader = self.get_header()

        # Output 1: JSON column on the variants table
        if transcripts_info_json:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_json,
                column_type="JSON",
                default_value=None,
                drop=False,
            )

            # Register the INFO field in the header.
            # NOTE(review): "unknwon" (sic) is the source/version placeholder
            # used throughout; left as-is because it is a runtime value
            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
                transcripts_info_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_json.append(
                f""" {transcripts_info_json}=t.{transcripts_info_json} """
            )

        # Output 2: JSON appended to the INFO field
        if transcripts_info_field_json:

            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")

            # Append ';<field>=<json>' to INFO, skipping empty values
            update_set_json.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_json}=',
                            t.{transcripts_info_json}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Register the INFO field in the header
            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
                transcripts_info_field_json,
                ".",
                "String",
                "Transcripts in JSON format",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_json:

            # Aggregate all transcripts of a variant into one JSON object
            # keyed by transcript ID, then update the variants table
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_json)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            concat(
                                '{{',
                                string_agg(
                                    '"' || "{transcripts_column_id}" || '":' ||
                                    to_json(json_output)
                                ),
                                '}}'
                            )::JSON AS {transcripts_info_json}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    to_json(
                                        {{{",".join(clause_to_json)}}}
                                    )::JSON AS json_output
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                                WHERE "{transcripts_column_id}" IS NOT NULL
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        # Output 3: structured-format column on the variants table
        if transcripts_info_format:

            # Create column on variants table
            self.add_column(
                table_name=table_variants,
                column_name=transcripts_info_format,
                column_type="VARCHAR",
                default_value=None,
                drop=False,
            )

            # Register the INFO field in the header
            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
                transcripts_info_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

            # Add to update
            update_set_format.append(
                f""" {transcripts_info_format}=t.{transcripts_info_format} """
            )

        else:

            # Internal alias so the queries below stay valid when no
            # structured column is requested
            transcripts_info_format = "transcripts_info_format"

        # Output 4: structured format appended to the INFO field
        if transcripts_info_field_format:

            log.debug(f"{msg_info_prefix} - Annotation in structured format...")

            # Append ';<field>=<value>' to INFO, skipping empty values
            update_set_format.append(
                f"""
                INFO = concat(
                    CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END,
                    CASE
                        WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
                        THEN concat(
                            ';{transcripts_info_field_format}=',
                            t.{transcripts_info_format}
                        )
                        ELSE ''
                    END
                )
                """
            )

            # Register the INFO field in the header
            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
                transcripts_info_field_format,
                ".",
                "String",
                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
                "unknwon",
                "unknwon",
                self.code_type_map["String"],
            )

        if update_set_format:

            # Aggregate all transcripts of a variant into one
            # pipe-separated string per transcript, comma-joined, then
            # update the variants table
            query_update = f"""
                UPDATE {table_variants}
                SET {", ".join(update_set_format)}
                FROM
                    (
                        SELECT
                            "#CHROM", POS, REF, ALT,
                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
                        FROM
                            (
                                SELECT
                                    "#CHROM", POS, REF, ALT,
                                    "{transcripts_column_id}",
                                    concat(
                                        "{transcripts_column_id}",
                                        '|',
                                        {", '|', ".join(clause_to_format)}
                                    ) AS {transcripts_info_format}
                                FROM
                                    (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
                            )
                        GROUP BY "#CHROM", POS, REF, ALT
                    ) AS t
                WHERE {table_variants}."#CHROM" = t."#CHROM"
                    AND {table_variants}."POS" = t."POS"
                    AND {table_variants}."REF" = t."REF"
                    AND {table_variants}."ALT" = t."ALT"
            """

            self.execute_query(query=query_update)

        return True
def __init__(
    self,
    conn=None,
    input: str = None,
    output: str = None,
    config: dict = None,
    param: dict = None,
    load: bool = False,
) -> None:
    """
    Initialize the Variants object: set input, output, config, param,
    connexion and header, then optionally load the data.

    :param conn: the connection to the database (a new one is created if not provided)
    :param input: the input file
    :param output: the output file
    :param config: a dictionary containing the configuration
    :param param: a dictionary containing the parameters
    :param load: if True, load the input data immediately after setup
    """

    # Avoid mutable default arguments: a shared dict default would leak
    # state between instances. `None` is replaced by a fresh empty dict,
    # which is backward-compatible with the previous `{}` defaults.
    if config is None:
        config = {}
    if param is None:
        param = {}

    # Init variables
    self.init_variables()

    # Input
    self.set_input(input)

    # Config
    self.set_config(config)

    # Param
    self.set_param(param)

    # Output
    self.set_output(output)

    # connexion
    self.set_connexion(conn)

    # Header
    self.set_header()

    # Samples
    self.set_samples()

    # Load data
    if load:
        self.load_data()
The function __init__ initializes the variables, sets the input, output, config, param, connexion and
header
Parameters
- conn: the connection to the database
- input: the input file
- output: the output file
- config: a dictionary containing the configuration of the model
- param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
    """
    Assign the `samples` attribute of the object.

    When `samples` is empty or None, fall back to the sample list declared
    in the parameters (`param["samples"]["list"]`), or None when absent.

    :param samples: list of sample names to assign
    :type samples: list
    :return: the list that was assigned to `self.samples`.
    """

    chosen = samples
    if not chosen:
        # Fall back to the sample list declared in param
        samples_param = self.get_param().get("samples", {})
        chosen = samples_param.get("list", None)

    self.samples = chosen
    return chosen
The function set_samples sets the samples attribute of an object to a provided list or
retrieves it from a parameter dictionary.
Parameters
- samples: a list of samples to assign to the `samples` attribute of the class. If no samples are provided, the method tries to get the samples from the class's parameters using the `get_param` method.
Returns
The `samples` list is being returned.
def get_samples(self) -> list:
    """
    Return the list of samples stored on the object.

    :return: the `samples` attribute.
    """

    return self.samples
This function returns a list of samples.
Returns
The `get_samples` method returns the `samples` attribute of the object.
def get_samples_check(self) -> bool:
    """
    Return whether sample checking is enabled.

    Reads the "check" key of the "samples" dictionary inside the dictionary
    returned by `get_param()`.

    :return: the value of `param["samples"]["check"]`, or `True` when the
        key is absent (checking is enabled by default).
    """

    return self.get_param().get("samples", {}).get("check", True)
This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.
Returns
The method `get_samples_check` returns the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the `get_param()` method. If the key "check" is not found, it returns `True` (the default).
def set_input(self, input: str = None) -> None:
    """
    Set the input file and derive input-related attributes.

    Accepts either a path string or a file-like object exposing a `name`
    attribute. When an input is provided, also sets `input_name`,
    `input_extension` and `input_format` from the file extension.

    :param input: the input file (path string or file-like object)
    :type input: str
    :raises ValueError: if `input` is neither a string nor an object with
        a `name` attribute
    """

    if input and not isinstance(input, str):
        # Non-string input: expect a file-like object with a `name` attribute
        try:
            self.input = input.name
        except AttributeError:
            # Narrowed from a bare `except:`; also fixed the unbalanced
            # quote in the message (was "'{input} in bad format").
            log.error(f"Input file '{input}' in bad format")
            raise ValueError(f"Input file '{input}' in bad format")
    else:
        self.input = input

    # Input format: derive name/extension/format from the file path
    if input:
        input_name, input_extension = os.path.splitext(self.input)
        self.input_name = input_name
        self.input_extension = input_extension
        self.input_format = self.input_extension.replace(".", "")
The function set_input takes a file name as input, extracts the name and extension, and sets
attributes in the class accordingly.
Parameters
- input: the input file. The `set_input` method extracts its name and extension and sets the corresponding attributes of the class.
def set_config(self, config: dict) -> None:
    """
    Store the given dictionary as the configuration object of the class.

    :param config: dictionary of configuration settings
    :type config: dict
    """

    self.config = config
The set_config function takes a config object and assigns it as the configuration object for the class.
Parameters
- config: a dictionary object that contains configuration settings for the class. The `set_config` method stores it as the configuration object of the class.
def set_param(self, param: dict) -> None:
    """
    Store the given dictionary as the `param` attribute of the instance.

    :param param: dictionary of parameters
    :type param: dict
    """

    self.param = param
This function sets a parameter object for the class based on the input dictionary.
Parameters
- param: a dictionary object. The `set_param` method stores it as the `param` attribute of the class instance.
def init_variables(self) -> None:
    """
    Initialize the attributes used throughout the class: table names,
    operator/type mapping tables and the additional-index field list.
    """

    # Base naming and state
    self.prefix = "howard"
    self.table_variants = "variants"
    self.dataframe = None

    # Mapping from filter keywords to SQL comparison operators
    self.comparison_map = dict(
        gt=">",
        gte=">=",
        lt="<",
        lte="<=",
        equals="=",
        contains="SIMILAR TO",
    )

    # VCF header type name -> internal type code
    self.code_type_map = dict(Integer=0, String=1, Float=2, Flag=3)

    # VCF header type name -> SQL column type
    self.code_type_map_to_sql = dict(
        Integer="INTEGER",
        String="VARCHAR",
        Float="FLOAT",
        Flag="VARCHAR",
    )

    # Extra fields to index (none by default)
    self.index_additionnal_fields = []
This function initializes the variables that will be used in the rest of the class
def get_indexing(self) -> bool:
    """
    Return the "indexing" flag from the parameters.

    :return: the value of the "indexing" key of the param dictionary,
        or False when the key is absent.
    """

    parameters = self.get_param()
    return parameters.get("indexing", False)
It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.
Returns
The value of the indexing parameter.
def get_connexion_config(self) -> dict:
    """
    Build the connection configuration dictionary (threads, memory limit,
    temporary directory and access mode).

    :return: a dictionary suitable for opening the database connection.
    """

    # config
    config = self.get_config()

    # Connexion config
    connexion_config = {}

    # Threads
    threads = self.get_threads()
    if threads:
        connexion_config["threads"] = threads

    # Memory limit (hoisted: call the getter once)
    memory = self.get_memory()
    if memory:
        connexion_config["memory_limit"] = memory

    # Temporary directory
    if config.get("tmp", None):
        connexion_config["temp_directory"] = config.get("tmp")

    # Access mode
    if config.get("access", None):
        access = config.get("access")
        if access in ["RO"]:
            access = "READ_ONLY"
        elif access in ["RW"]:
            access = "READ_WRITE"
        connexion_db = self.get_connexion_db()
        # BUGFIX: was `connexion_db in ":memory:"`, a substring test that
        # forced READ_WRITE for any substring of ":memory:" (e.g. "mem",
        # ":", or the empty string). In-memory databases cannot be
        # read-only, so only an exact ":memory:" db should force it.
        if connexion_db == ":memory:":
            access = "READ_WRITE"
        connexion_config["access_mode"] = access

    return connexion_config
The function get_connexion_config returns a dictionary containing the configuration for a
connection, including the number of threads and memory limit.
Returns
a dictionary containing the configuration for the Connexion library.
def get_duckdb_settings(self) -> dict:
    """
    Retrieve DuckDB settings from the configuration.

    The "duckdb_settings" config entry may be either a path to a settings
    file (parsed as YAML/JSON) or a JSON string.

    :return: dictionary of DuckDB settings (empty when none configured).
    """

    settings = {}
    duckdb_settings = self.get_config().get("duckdb_settings", None)
    if duckdb_settings:
        resolved = full_path(duckdb_settings)
        if os.path.exists(resolved):
            # Settings provided as a file
            with open(resolved) as settings_file:
                settings = yaml.safe_load(settings_file)
        else:
            # Settings provided directly as a JSON string
            settings = json.loads(resolved)

    return settings
The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a
string.
Returns
The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
def set_connexion_db(self) -> str:
    """
    Determine and store the database connection string, based on the
    input format and the configured connection type.

    :return: the connection string assigned to `self.connexion_db`.
    """

    in_memory = ":memory:"

    if self.get_input_format() in ["db", "duckdb"]:
        # The input itself is the database
        connexion_db = self.get_input()
    else:
        connexion_type = self.get_connexion_type()
        if connexion_type in ["memory", in_memory, None]:
            connexion_db = in_memory
        elif connexion_type == "tmpfile":
            # A temporary directory hosting a tmp.db file
            tmp_name = tempfile.mkdtemp(
                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
            )
            connexion_db = f"{tmp_name}/tmp.db"
        elif connexion_type != "":
            # Any other non-empty value is used as the db path directly
            connexion_db = connexion_type
        else:
            connexion_db = in_memory

    # Set connexion db
    self.connexion_db = connexion_db

    return connexion_db
The function set_connexion_db returns the appropriate database connection string based on the
input format and connection type.
Returns
the value of the variable `connexion_db`.
def set_connexion(self, conn) -> None:
    """
    Create (or adopt) the database connection and store it on the object.

    When `conn` is None, a new connection is opened using the connection
    string from `set_connexion_db()` and the options from
    `get_connexion_config()`, in the configured format ("duckdb" by
    default, or "sqlite"). For duckdb, any configured DuckDB settings are
    applied as PRAGMA statements right after connecting.

    :param conn: an existing database connection to reuse, or None to
        create a new one
    """

    # Connexion db: resolve and store the connection string first
    connexion_db = self.set_connexion_db()

    # Connexion config (threads, memory limit, temp dir, access mode)
    connexion_config = self.get_connexion_config()

    # Connexion format: "duckdb" unless overridden in the config
    connexion_format = self.get_config().get("connexion_format", "duckdb")
    # Set connexion format
    self.connexion_format = connexion_format

    # Connexion: only create one when none was supplied
    if not conn:
        if connexion_format in ["duckdb"]:
            conn = duckdb.connect(connexion_db, config=connexion_config)
            # duckDB settings: applied as PRAGMAs; string values are quoted
            duckdb_settings = self.get_duckdb_settings()
            if duckdb_settings:
                for setting in duckdb_settings:
                    setting_value = duckdb_settings.get(setting)
                    if isinstance(setting_value, str):
                        setting_value = f"'{setting_value}'"
                    conn.execute(f"PRAGMA {setting}={setting_value};")
        elif connexion_format in ["sqlite"]:
            # NOTE(review): connexion_config is not applied to sqlite
            # connections — only duckdb consumes it.
            conn = sqlite3.connect(connexion_db)

    # Set connexion
    self.conn = conn

    # Log
    log.debug(f"connexion_format: {connexion_format}")
    log.debug(f"connexion_db: {connexion_db}")
    log.debug(f"connexion config: {connexion_config}")
    log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
The function set_connexion creates a connection to a database, with options for different
database formats and settings.
Parameters
- conn: the connection to the database. If no connection is provided, a new connection is created (to an in-memory database by default) and set up according to the configured format (e.g. duckdb or sqlite).
def set_output(self, output: str = None) -> None:
    """
    Set the output file and derive output-related attributes.

    Accepts either a path string or a file-like object exposing a `name`
    attribute. When an output is set, `output_name`, `output_extension`
    and `output_format` are derived from the file extension; otherwise
    they are all set to None.

    :param output: the output file (path string or file-like object)
    :type output: str
    """

    # Accept file-like objects through their `name` attribute
    if output and not isinstance(output, str):
        self.output = output.name
    else:
        self.output = output

    # Output format: derive name/extension/format, or reset everything
    if self.output:
        output_name, output_extension = os.path.splitext(self.output)
        self.output_name = output_name
        self.output_extension = output_extension
        self.output_format = self.output_extension.replace(".", "")
    else:
        self.output_name = None
        self.output_extension = None
        self.output_format = None
The set_output function in Python sets the output file based on the input or a specified key
in the config file, extracting the output name, extension, and format.
Parameters
- output: the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to `None`.
def set_header(self) -> None:
    """
    Read the header of the input file and store it both as a list of
    strings (`self.header_list`) and as a VCF reader object
    (`self.header_vcf`).

    The header is looked up, in order: an explicit "header_file" in the
    config; the input file itself (for vcf/hdr, compressed or not); a
    sidecar "<input>.hdr" file; otherwise it is reconstructed from the
    file's columns via the Database helper. On any failure, a minimal
    default VCF header is used. When there is no input file, both
    attributes are set to None.

    :raises ValueError: if the input format is not supported
    """

    input_file = self.get_input()
    # Minimal fallback header (tab-separated VCF column line)
    default_header_list = [
        "##fileformat=VCFv4.2",
        "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
    ]

    # Full path
    input_file = full_path(input_file)

    if input_file:

        input_format = self.get_input_format()
        input_compressed = self.get_input_compressed()
        config = self.get_config()
        header_list = default_header_list
        if input_format in [
            "vcf",
            "hdr",
            "tsv",
            "csv",
            "psv",
            "parquet",
            "db",
            "duckdb",
        ]:
            # header provided in param
            if config.get("header_file", None):
                with open(config.get("header_file"), "rt") as f:
                    header_list = self.read_vcf_header(f)
            # within a vcf file format (header within input file itself)
            elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                # within a compressed vcf file format (.vcf.gz)
                if input_compressed:
                    with bgzf.open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within an uncompressed vcf file format (.vcf)
                else:
                    with open(input_file, "rt") as f:
                        header_list = self.read_vcf_header(f)
            # header provided in default external file .hdr
            elif os.path.exists((input_file + ".hdr")):
                with open(input_file + ".hdr", "rt") as f:
                    header_list = self.read_vcf_header(f)
            else:
                try:  # Try to get header info fields and file columns

                    with tempfile.TemporaryDirectory() as tmpdir:

                        # Create database wrapper around the input file
                        db_for_header = Database(database=input_file)

                        # Get header columns for infos fields
                        db_header_from_columns = (
                            db_for_header.get_header_from_columns()
                        )

                        # Get real columns in the file
                        db_header_columns = db_for_header.get_columns()

                        # Write header file
                        header_file_tmp = os.path.join(tmpdir, "header")
                        f = open(header_file_tmp, "w")
                        vcf.Writer(f, db_header_from_columns)
                        f.close()

                        # Replace #CHROM line with real columns
                        header_list = db_for_header.read_header_file(
                            header_file=header_file_tmp
                        )
                        header_list[-1] = "\t".join(db_header_columns)

                # NOTE(review): bare except — any failure (including bugs
                # in the Database helper) silently falls back to the
                # default header.
                except:

                    log.warning(
                        f"No header for file {input_file}. Set as default VCF header"
                    )
                    header_list = default_header_list

        else:  # unsupported format: fail loudly

            log.error(f"Input file format '{input_format}' not available")
            raise ValueError(f"Input file format '{input_format}' not available")

        if not header_list:
            header_list = default_header_list

        # header as list
        self.header_list = header_list

        # header as VCF object
        self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

    else:

        # No input file: no header available
        self.header_list = None
        self.header_vcf = None
It reads the header of a VCF file and stores it as a list of strings and as a VCF object
def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
    """
    Execute a SQL query and return the result as a pandas DataFrame,
    using the appropriate fetch path for the connection format.

    :param query: SQL query to execute
    :type query: str
    :param limit: maximum number of rows to fetch; when None, the full
        result set is returned
    :type limit: int
    :return: the query result as a pandas DataFrame.
    """

    fmt = self.get_connexion_format()

    if limit:
        # Bounded fetch: also raise pandas' display row cap accordingly
        pd.set_option("display.max_rows", limit)
        if fmt in ["duckdb"]:
            record_batches = self.conn.execute(query).fetch_record_batch(limit)
            df = record_batches.read_next_batch().to_pandas()
        elif fmt in ["sqlite"]:
            chunks = pd.read_sql_query(query, self.conn, chunksize=limit)
            df = next(chunks)
    else:
        # Full fetch
        if fmt in ["duckdb"]:
            df = self.conn.execute(query).df()
        elif fmt in ["sqlite"]:
            df = pd.read_sql_query(query, self.conn)

    return df
The get_query_to_df function takes a query as a string and returns the result as a pandas
DataFrame based on the connection format.
Parameters
- query: a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame.
- limit: the maximum number of rows to return in the resulting DataFrame. If a limit is provided, the function fetches at most that many rows from the query result; if no limit is specified, the full result is returned.
Returns
A pandas DataFrame is returned by the `get_query_to_df` function.
def get_overview(self) -> None:
    """
    Log an overview of the current object: input/output files with their
    formats, the config and param dictionaries (pretty-printed), the
    sample list, and a dataframe preview of the variants table.
    """

    # Preview dataframe of the variants table (header columns only)
    variants_from = self.get_table_variants(clause="from")
    columns_sql = self.get_header_columns_as_sql()
    df = self.get_query_to_df(f"SELECT {columns_sql} FROM {variants_from}")

    # Input / output with their formats
    log.info(f"Input: {self.get_input()} [{self.get_input_format()}]")
    log.info(f"Output: {self.get_output()} [{self.get_output_format()}]")

    # Config and param, pretty-printed one line at a time
    log.info("Config: ")
    for line in json.dumps(self.get_config(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + line)
    log.info("Param: ")
    for line in json.dumps(self.get_param(), indent=4, sort_keys=True).split("\n"):
        log.info("\t" + line)

    # Samples and dataframe preview
    log.info("Sample list: " + str(self.get_header_sample_list()))
    log.info("Dataframe: ")
    for line in str(df).split("\n"):
        log.info("\t" + line)

    # garbage collector: drop the preview dataframe eagerly
    del df
    gc.collect()

    return None
The function prints the input, output, config, and dataframe of the current object
def get_stats(self) -> dict:
    """
    Compute statistics over the loaded variants.

    Returns a dictionary with the sections: "Infos" (input file, counts of
    variants/samples/INFO fields/FORMAT fields), "Variants" (per-chromosome
    counts, SNV/MNV/InDel counts, SNV substitutions), "Samples" (per-sample
    genotype counts), "Header" (INFO/FORMAT field descriptions), and
    "Quality" (QUAL summary statistics, when a QUAL column exists).

    :return: a dictionary of statistics, structured as described above.
    """

    # Log
    log.info(f"Stats Calculation...")

    # table variants
    table_variants_from = self.get_table_variants()

    # stats dict
    stats = {"Infos": {}}

    ### File
    input_file = self.get_input()
    stats["Infos"]["Input file"] = input_file

    # Header: INFO and FORMAT field definitions from the VCF header
    header_infos = self.get_header().infos
    header_formats = self.get_header().formats
    header_infos_list = list(header_infos)
    header_formats_list = list(header_formats)

    ### Variants

    stats["Variants"] = {}

    # Variants by chr
    sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
    df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
    nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
        by=["CHROM"], kind="quicksort"
    )

    # Total number of variants
    nb_of_variants = nb_of_variants_by_chrom["count"].sum()

    # Calculate percentage (fraction of total per chromosome)
    nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
        lambda x: (x / nb_of_variants)
    )

    stats["Variants"]["Number of variants by chromosome"] = (
        nb_of_variants_by_chrom.to_dict(orient="index")
    )

    stats["Infos"]["Number of variants"] = int(nb_of_variants)

    ### Samples

    # Init
    samples = {}
    nb_of_samples = 0

    # Check Samples: only meaningful when GT is declared and a FORMAT column exists
    if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
        log.debug(f"Check samples...")
        for sample in self.get_header_sample_list():
            # Count genotypes per sample; rows must have a genotype-shaped
            # value and a sample field matching the FORMAT field count
            sql_query_samples = f"""
                SELECT
                    '{sample}' as sample,
                    REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                    count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                    concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                FROM {table_variants_from}
                WHERE (
                    regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                    AND
                    len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                )
                GROUP BY genotype
            """
            sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
            sample_genotype_count = sql_query_genotype_df["count"].sum()
            if len(sql_query_genotype_df):
                nb_of_samples += 1
                samples[f"{sample} - {sample_genotype_count} variants"] = (
                    sql_query_genotype_df.to_dict(orient="index")
                )

    stats["Samples"] = samples
    stats["Infos"]["Number of samples"] = nb_of_samples

    # #
    # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
    #     stats["Infos"]["Number of samples"] = nb_of_samples
    # elif nb_of_samples:
    #     stats["Infos"]["Number of samples"] = "not a VCF format"

    ### INFO and FORMAT fields
    header_types_df = {}
    header_types_list = {
        "List of INFO fields": header_infos,
        "List of FORMAT fields": header_formats,
    }
    # `i` keeps a single running index across both INFO and FORMAT sections
    i = 0
    for header_type in header_types_list:

        header_type_infos = header_types_list.get(header_type)
        header_infos_dict = {}

        for info in header_type_infos:

            i += 1
            header_infos_dict[i] = {}

            # ID
            header_infos_dict[i]["id"] = info

            # num: map special VCF Number codes to their letters
            # (None -> ".", -1 -> "A", -2 -> "G", -3 -> "R")
            genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
            if header_type_infos[info].num in genotype_map.keys():
                header_infos_dict[i]["Number"] = genotype_map.get(
                    header_type_infos[info].num
                )
            else:
                header_infos_dict[i]["Number"] = header_type_infos[info].num

            # type
            if header_type_infos[info].type:
                header_infos_dict[i]["Type"] = header_type_infos[info].type
            else:
                header_infos_dict[i]["Type"] = "."

            # desc
            if header_type_infos[info].desc != None:
                header_infos_dict[i]["Description"] = header_type_infos[info].desc
            else:
                header_infos_dict[i]["Description"] = ""

        if len(header_infos_dict):
            header_types_df[header_type] = pd.DataFrame.from_dict(
                header_infos_dict, orient="index"
            ).to_dict(orient="index")

    # Stats
    stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
    stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
    stats["Header"] = header_types_df

    ### QUAL summary statistics (skipping missing values ".")
    if "QUAL" in self.get_header_columns():
        sql_query_qual = f"""
            SELECT
                avg(CAST(QUAL AS INTEGER)) AS Average,
                min(CAST(QUAL AS INTEGER)) AS Minimum,
                max(CAST(QUAL AS INTEGER)) AS Maximum,
                stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                median(CAST(QUAL AS INTEGER)) AS Median,
                variance(CAST(QUAL AS INTEGER)) AS Variance
            FROM {table_variants_from}
            WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
        """

        qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
        stats["Quality"] = {"Stats": qual}

    ### SNV and InDel counts
    # NOTE(review): in the InDel clause, AND binds tighter than OR, so it
    # parses as `len(REF) > 1 OR (len(ALT) > 1 AND len(REF) != len(ALT))`
    # — verify this is the intended classification.

    sql_query_snv = f"""

        SELECT Type, count FROM (

            SELECT
                'Total' AS Type,
                count(*) AS count
            FROM {table_variants_from}

            UNION

            SELECT
                'MNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 AND len(ALT) > 1
                AND len(REF) = len(ALT)

            UNION

            SELECT
                'InDel' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) > 1 OR len(ALT) > 1
                AND len(REF) != len(ALT)

            UNION

            SELECT
                'SNV' AS Type,
                count(*) AS count
            FROM {table_variants_from}
            WHERE len(REF) = 1 AND len(ALT) = 1

        )

        ORDER BY count DESC

    """
    snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

    # SNV substitution spectrum (e.g. "A>G"), most frequent first
    sql_query_snv_substitution = f"""
        SELECT
            concat(REF, '>', ALT) AS 'Substitution',
            count(*) AS count
        FROM {table_variants_from}
        WHERE len(REF) = 1 AND len(ALT) = 1
        GROUP BY REF, ALT
        ORDER BY count(*) DESC
    """
    snv_substitution = (
        self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
    )
    stats["Variants"]["Counts"] = snv_indel
    stats["Variants"]["Substitutions"] = snv_substitution

    return stats
The get_stats function calculates and returns various statistics of the current object,
including information about the input file, variants, samples, header fields, quality, and
SNVs/InDels.
Returns
a dictionary containing various statistics of the current object. The dictionary has the following structure:
def stats_to_file(self, file: str = None) -> str:
    """
    Serialize the object's statistics to JSON and write them to a file.

    :param file: path of the JSON file to write
    :type file: str
    :return: the path of the file that was written.
    """

    # Compute stats and serialize them as pretty-printed JSON
    serialized = json.dumps(self.get_stats(), indent=4)

    # Write the JSON document to the requested file
    with open(file, "w") as outfile:
        outfile.write(serialized)

    return file
The function stats_to_file takes a file name as input, retrieves statistics, serializes them
into a JSON object, and writes the JSON object to the specified file.
Parameters
- file: a string that represents the file path where the JSON data will be written.
Returns
the name of the file that was written to.
def print_stats(self, output_file: str = None, json_file: str = None) -> None:
    """
    Generate a markdown report of the statistics and print it.

    Writes the stats as JSON (via `stats_to_file`), renders each section
    as markdown (tables where the data is tabular, bullet lines
    otherwise), writes the markdown to `output_file`, and prints it to
    stdout.

    :param output_file: path of the markdown output file; when None, a
        "stats.md" file in a temporary directory is used
    :type output_file: str
    :param json_file: path of the JSON stats file; when None, a
        "stats.json" file in a temporary directory is used
    :type json_file: str
    :return: None
    """

    # Full path
    output_file = full_path(output_file)
    json_file = full_path(json_file)

    with tempfile.TemporaryDirectory() as tmpdir:

        # Files: default both outputs into the temporary directory
        if not output_file:
            output_file = os.path.join(tmpdir, "stats.md")
        if not json_file:
            json_file = os.path.join(tmpdir, "stats.json")

        # Create folders
        if not os.path.exists(os.path.dirname(output_file)):
            Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
        if not os.path.exists(os.path.dirname(json_file)):
            Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)

        # Create stats JSON file
        stats_file = self.stats_to_file(file=json_file)

        # Read stats back (yaml.safe_load also parses JSON)
        with open(stats_file) as f:
            stats = yaml.safe_load(f)

        # Output buffers: title, index (table of contents), and body
        output_title = []
        output_index = []
        output = []

        # Title
        output_title.append("# HOWARD Stats")

        # Index
        output_index.append("## Index")

        # Process sections
        for section in stats:
            infos = stats.get(section)
            section_link = "#" + section.lower().replace(" ", "-")
            output.append(f"## {section}")
            output_index.append(f"- [{section}]({section_link})")

            if len(infos):
                for info in infos:
                    # Try to render the entry as a table: first directly,
                    # then as a JSON string; otherwise fall back to a
                    # plain bullet line
                    try:
                        df = pd.DataFrame.from_dict(infos.get(info), orient="index")
                        is_df = True
                    except:
                        try:
                            df = pd.DataFrame.from_dict(
                                json.loads((infos.get(info))), orient="index"
                            )
                            is_df = True
                        except:
                            is_df = False
                    if is_df:
                        output.append(f"### {info}")
                        info_link = "#" + info.lower().replace(" ", "-")
                        output_index.append(f" - [{info}]({info_link})")
                        output.append(f"{df.to_markdown(index=False)}")
                    else:
                        output.append(f"- {info}: {infos.get(info)}")
            else:
                output.append(f"NA")

        # Write stats in markdown file
        with open(output_file, "w") as fp:
            for item in output_title:
                fp.write("%s\n" % item)
            for item in output_index:
                fp.write("%s\n" % item)
            for item in output:
                fp.write("%s\n" % item)

        # Output stats in markdown (index is not printed, only written)
        print("")
        print("\n\n".join(output_title))
        print("")
        print("\n\n".join(output))
        print("")

    return None
The print_stats function generates a markdown file and prints the statistics contained in a
JSON file in a formatted manner.
Parameters
- output_file: The
`output_file` parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no `output_file` is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that directory. - json_file: The
`json_file` parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used.
Returns
The function
`print_stats` does not return any value. It has a return type annotation of `None`.
917 def get_input(self) -> str: 918 """ 919 It returns the value of the input variable. 920 :return: The input is being returned. 921 """ 922 return self.input
It returns the value of the input variable.
Returns
The input is being returned.
924 def get_input_format(self, input_file: str = None) -> str: 925 """ 926 This function returns the format of the input variable, either from the provided input file or 927 by prompting for input. 928 929 :param input_file: The `input_file` parameter in the `get_input_format` method is a string that 930 represents the file path of the input file. If no `input_file` is provided when calling the 931 method, it will default to `None` 932 :type input_file: str 933 :return: The format of the input variable is being returned. 934 """ 935 936 if not input_file: 937 input_file = self.get_input() 938 input_format = get_file_format(input_file) 939 return input_format
This function returns the format of the input variable, either from the provided input file or by prompting for input.
Parameters
- input_file: The
`input_file` parameter in the `get_input_format` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None`.
Returns
The format of the input variable is being returned.
941 def get_input_compressed(self, input_file: str = None) -> str: 942 """ 943 The function `get_input_compressed` returns the format of the input variable after compressing 944 it. 945 946 :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string 947 that represents the file path of the input file. If no `input_file` is provided when calling the 948 method, it will default to `None` and the method will then call `self.get_input()` to 949 :type input_file: str 950 :return: The function `get_input_compressed` returns the compressed format of the input 951 variable. 952 """ 953 954 if not input_file: 955 input_file = self.get_input() 956 input_compressed = get_file_compressed(input_file) 957 return input_compressed
The function get_input_compressed returns the format of the input variable after compressing
it.
Parameters
- input_file: The
`input_file` parameter in the `get_input_compressed` method is a string that represents the file path of the input file. If no `input_file` is provided when calling the method, it will default to `None` and the method will then call `self.get_input()` to obtain it.
Returns
The function
`get_input_compressed` returns the compressed format of the input variable.
959 def get_output(self) -> str: 960 """ 961 It returns the output of the neuron. 962 :return: The output of the neural network. 963 """ 964 965 return self.output
It returns the output file path.
Returns
The output file path.
967 def get_output_format(self, output_file: str = None) -> str: 968 """ 969 The function `get_output_format` returns the format of the input variable or the output file if 970 provided. 971 972 :param output_file: The `output_file` parameter in the `get_output_format` method is a string 973 that represents the file path of the output file. If no `output_file` is provided when calling 974 the method, it will default to the output obtained from the `get_output` method of the class 975 instance. The 976 :type output_file: str 977 :return: The format of the input variable is being returned. 978 """ 979 980 if not output_file: 981 output_file = self.get_output() 982 output_format = get_file_format(output_file) 983 984 return output_format
The function get_output_format returns the format of the input variable or the output file if
provided.
Parameters
- output_file: The
`output_file` parameter in the `get_output_format` method is a string that represents the file path of the output file. If no `output_file` is provided when calling the method, it will default to the output obtained from the `get_output` method of the class instance.
Returns
The format of the input variable is being returned.
986 def get_config(self) -> dict: 987 """ 988 It returns the config 989 :return: The config variable is being returned. 990 """ 991 return self.config
It returns the config
Returns
The config variable is being returned.
993 def get_param(self) -> dict: 994 """ 995 It returns the param 996 :return: The param variable is being returned. 997 """ 998 return self.param
It returns the param
Returns
The param variable is being returned.
1000 def get_connexion_db(self) -> str: 1001 """ 1002 It returns the connexion_db attribute of the object 1003 :return: The connexion_db is being returned. 1004 """ 1005 return self.connexion_db
It returns the connexion_db attribute of the object
Returns
The connexion_db is being returned.
1007 def get_prefix(self) -> str: 1008 """ 1009 It returns the prefix of the object. 1010 :return: The prefix is being returned. 1011 """ 1012 return self.prefix
It returns the prefix of the object.
Returns
The prefix is being returned.
1014 def get_table_variants(self, clause: str = "select") -> str: 1015 """ 1016 This function returns the table_variants attribute of the object 1017 1018 :param clause: the type of clause the table will be used. Either "select" or "from" (optional), 1019 defaults to select (optional) 1020 :return: The table_variants attribute of the object. 1021 """ 1022 1023 # Access 1024 access = self.get_config().get("access", None) 1025 1026 # Clauses "select", "where", "update" 1027 if clause in ["select", "where", "update"]: 1028 table_variants = self.table_variants 1029 # Clause "from" 1030 elif clause in ["from"]: 1031 # For Read Only 1032 if self.get_input_format() in ["parquet"] and access in ["RO"]: 1033 input_file = self.get_input() 1034 table_variants = f"'{input_file}' as variants" 1035 # For Read Write 1036 else: 1037 table_variants = f"{self.table_variants} as variants" 1038 else: 1039 table_variants = self.table_variants 1040 return table_variants
This function returns the table_variants attribute of the object
Parameters
- clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns
The table_variants attribute of the object.
1042 def get_tmp_dir(self) -> str: 1043 """ 1044 The function `get_tmp_dir` returns the temporary directory path based on configuration 1045 parameters or a default path. 1046 :return: The `get_tmp_dir` method is returning the temporary directory path based on the 1047 configuration, parameters, and a default value of "/tmp". 1048 """ 1049 1050 return get_tmp( 1051 config=self.get_config(), param=self.get_param(), default_tmp="/tmp" 1052 )
The function get_tmp_dir returns the temporary directory path based on configuration
parameters or a default path.
Returns
The
`get_tmp_dir` method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".
1054 def get_connexion_type(self) -> str: 1055 """ 1056 If the connexion type is not in the list of allowed connexion types, raise a ValueError 1057 1058 :return: The connexion type is being returned. 1059 """ 1060 return self.get_config().get("connexion_type", "memory")
If the connexion type is not in the list of allowed connexion types, raise a ValueError
Returns
The connexion type is being returned.
1062 def get_connexion(self): 1063 """ 1064 It returns the connection object 1065 1066 :return: The connection object. 1067 """ 1068 return self.conn
It returns the connection object
Returns
The connection object.
1070 def close_connexion(self) -> None: 1071 """ 1072 This function closes the connection to the database. 1073 :return: The connection is being closed. 1074 """ 1075 return self.conn.close()
This function closes the connection to the database.
Returns
The connection is being closed.
1077 def get_header(self, type: str = "vcf"): 1078 """ 1079 This function returns the header of the VCF file as a list of strings 1080 1081 :param type: the type of header you want to get, defaults to vcf (optional) 1082 :return: The header of the vcf file. 1083 """ 1084 1085 if self.header_vcf: 1086 if type == "vcf": 1087 return self.header_vcf 1088 elif type == "list": 1089 return self.header_list 1090 else: 1091 if type == "vcf": 1092 header = vcf.Reader(io.StringIO("\n".join(vcf_required))) 1093 return header 1094 elif type == "list": 1095 return vcf_required
This function returns the header of the VCF file as a list of strings
Parameters
- type: the type of header you want to get, defaults to vcf (optional)
Returns
The header of the vcf file.
1097 def get_header_infos_list(self) -> list: 1098 """ 1099 This function retrieves a list of information fields from the header. 1100 :return: A list of information fields from the header. 1101 """ 1102 1103 # Init 1104 infos_list = [] 1105 1106 for field in self.get_header().infos: 1107 infos_list.append(field) 1108 1109 return infos_list
This function retrieves a list of information fields from the header.
Returns
A list of information fields from the header.
1111 def get_header_length(self, file: str = None) -> int: 1112 """ 1113 The function `get_header_length` returns the length of the header list, excluding the #CHROM 1114 line. 1115 1116 :param file: The `file` parameter is an optional argument that specifies the path to a VCF 1117 header file. If this argument is provided, the function will read the header from the specified 1118 file and return the length of the header list minus 1 (to exclude the #CHROM line) 1119 :type file: str 1120 :return: the length of the header list, excluding the #CHROM line. 1121 """ 1122 1123 if file: 1124 return len(self.read_vcf_header_file(file=file)) - 1 1125 elif self.get_header(type="list"): 1126 return len(self.get_header(type="list")) - 1 1127 else: 1128 return 0
The function get_header_length returns the length of the header list, excluding the #CHROM
line.
Parameters
- file: The
`file` parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line).
Returns
the length of the header list, excluding the #CHROM line.
1130 def get_header_columns(self) -> str: 1131 """ 1132 This function returns the header list of a VCF 1133 1134 :return: The length of the header list. 1135 """ 1136 if self.get_header(): 1137 return self.get_header(type="list")[-1] 1138 else: 1139 return ""
This function returns the header list of a VCF
Returns
The length of the header list.
1141 def get_header_columns_as_list(self) -> list: 1142 """ 1143 This function returns the header list of a VCF 1144 1145 :return: The length of the header list. 1146 """ 1147 if self.get_header(): 1148 return self.get_header_columns().strip().split("\t") 1149 else: 1150 return []
This function returns the header list of a VCF
Returns
The length of the header list.
1152 def get_header_columns_as_sql(self) -> str: 1153 """ 1154 This function retruns header length (without #CHROM line) 1155 1156 :return: The length of the header list. 1157 """ 1158 sql_column_list = [] 1159 for col in self.get_header_columns_as_list(): 1160 sql_column_list.append(f'"{col}"') 1161 return ",".join(sql_column_list)
This function retruns header length (without #CHROM line)
Returns
The length of the header list.
1163 def get_header_sample_list( 1164 self, check: bool = False, samples: list = None, samples_force: bool = False 1165 ) -> list: 1166 """ 1167 The function `get_header_sample_list` returns a list of samples from a VCF header, with optional 1168 checking and filtering based on input parameters. 1169 1170 :param check: The `check` parameter in the `get_header_sample_list` function is a boolean 1171 parameter that determines whether to check if the samples in the list are properly defined as 1172 genotype columns. If `check` is set to `True`, the function will verify if each sample in the 1173 list is defined as a, defaults to False 1174 :type check: bool (optional) 1175 :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that 1176 allows you to specify a subset of samples from the header. If you provide a list of sample 1177 names, the function will check if each sample is defined in the header. If a sample is not found 1178 in the 1179 :type samples: list 1180 :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is 1181 a boolean parameter that determines whether to force the function to return the sample list 1182 without checking if the samples are genotype columns. If `samples_force` is set to `True`, the 1183 function will return the sample list without performing, defaults to False 1184 :type samples_force: bool (optional) 1185 :return: The function `get_header_sample_list` returns a list of samples based on the input 1186 parameters and conditions specified in the function. 
1187 """ 1188 1189 # Init 1190 samples_list = [] 1191 1192 if samples is None: 1193 samples_list = self.header_vcf.samples 1194 else: 1195 samples_checked = [] 1196 for sample in samples: 1197 if sample in self.header_vcf.samples: 1198 samples_checked.append(sample) 1199 else: 1200 log.warning(f"Sample '{sample}' not defined in header") 1201 samples_list = samples_checked 1202 1203 # Force sample list without checking if is_genotype_column 1204 if samples_force: 1205 log.warning(f"Samples {samples_list} not checked if genotypes") 1206 return samples_list 1207 1208 if check: 1209 samples_checked = [] 1210 for sample in samples_list: 1211 if self.is_genotype_column(column=sample): 1212 samples_checked.append(sample) 1213 else: 1214 log.warning( 1215 f"Sample '{sample}' not defined as a sample (genotype not well defined)" 1216 ) 1217 samples_list = samples_checked 1218 1219 # Return samples list 1220 return samples_list
The function get_header_sample_list returns a list of samples from a VCF header, with optional
checking and filtering based on input parameters.
Parameters
- check: The
`check` parameter in the `get_header_sample_list` function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If `check` is set to `True`, the function will verify each sample in the list, defaults to False. - samples: The
`samples` parameter in the `get_header_sample_list` function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header; samples not found in the header are dropped. - samples_force: The
`samples_force` parameter in the `get_header_sample_list` function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If `samples_force` is set to `True`, the function will return the sample list without performing the check, defaults to False.
Returns
The function
`get_header_sample_list` returns a list of samples based on the input parameters and conditions specified in the function.
1222 def is_genotype_column(self, column: str = None) -> bool: 1223 """ 1224 This function checks if a given column is a genotype column in a database. 1225 1226 :param column: The `column` parameter in the `is_genotype_column` method is a string that 1227 represents the column name in a database table. This method checks if the specified column is a 1228 genotype column in the database. If a column name is provided, it calls the `is_genotype_column` 1229 method of 1230 :type column: str 1231 :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter 1232 is not None, it calls the `is_genotype_column` method of the `Database` class with the specified 1233 column name and returns the result. If the `column` parameter is None, it returns False. 1234 """ 1235 1236 if column is not None: 1237 return Database(database=self.get_input()).is_genotype_column(column=column) 1238 else: 1239 return False
This function checks if a given column is a genotype column in a database.
Parameters
- column: The
`column` parameter in the `is_genotype_column` method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the `is_genotype_column` method of the `Database` class.
Returns
The
The `is_genotype_column` method is returning a boolean value. If the `column` parameter is not None, it calls the `is_genotype_column` method of the `Database` class with the specified column name and returns the result. If the `column` parameter is None, it returns False.
1241 def get_verbose(self) -> bool: 1242 """ 1243 It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't 1244 exist 1245 1246 :return: The value of the key "verbose" in the config dictionary. 1247 """ 1248 return self.get_config().get("verbose", False)
It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist
Returns
The value of the key "verbose" in the config dictionary.
1250 def get_connexion_format(self) -> str: 1251 """ 1252 It returns the connexion format of the object. 1253 :return: The connexion_format is being returned. 1254 """ 1255 connexion_format = self.connexion_format 1256 if connexion_format not in ["duckdb", "sqlite"]: 1257 log.error(f"Unknown connexion format {connexion_format}") 1258 raise ValueError(f"Unknown connexion format {connexion_format}") 1259 else: 1260 return connexion_format
It returns the connexion format of the object.
Returns
The connexion_format is being returned.
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table of the current database connection.

        :param file: path or file-like object of the file to load
        :param columns: comma-separated, quoted column names used in the
            INSERT statement (DuckDB branch only)
        :type columns: str
        :param header_len: number of leading lines to skip before the data
            (e.g. a VCF header), defaults to 0
        :type header_len: int (optional)
        :param sep: field delimiter of the input file, defaults to "\\t"
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk; may be overridden by
            the "load.chunk" config entry, defaults to 1000000
        :type chunksize: int (optional)
        """

        # Config
        # "load.chunk" in the config overrides the chunksize argument
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): if chunksize resolves to a falsy value, nothing is
        # inserted at all — confirm this is intended
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # DuckDB resolves the name "chunk" in this SQL through its
                    # replacement scan of local pandas DataFrames — the loop
                    # variable must keep this exact name
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite: pandas handles the insert directly
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
The function reads a file in chunks and inserts each chunk into a table based on the specified database format.
Parameters
- file: The
`file` parameter is the file that you want to load into a table. It should be the path to the file on your system. - columns: The
`columns` parameter in the `insert_file_to_table` function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string, for example "id", "name". - header_len: The
`header_len` parameter in the `insert_file_to_table` function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0. - sep: The
`sep` parameter in the `insert_file_to_table` function is used to specify the separator character that is used in the file being read. The default separator is `\t`, which represents a tab character; you can change this parameter to a different separator character, defaults to `\t`. - chunksize: The
`chunksize` parameter specifies the number of rows to read in at a time when processing the file in chunks, defaults to 1000000.
1316 def load_data( 1317 self, 1318 input_file: str = None, 1319 drop_variants_table: bool = False, 1320 sample_size: int = 20480, 1321 ) -> None: 1322 """ 1323 The `load_data` function reads a VCF file and inserts it into a table, with options to drop the 1324 table before loading the data and specify a sample size. 1325 1326 :param input_file: The path to the input file. This is the VCF file that will be loaded into the 1327 table 1328 :type input_file: str 1329 :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that 1330 determines whether the variants table should be dropped before loading the data. If set to 1331 `True`, the variants table will be dropped. If set to `False` (default), the variants table will 1332 not be dropped, defaults to False 1333 :type drop_variants_table: bool (optional) 1334 :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from 1335 the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 1336 20480 1337 :type sample_size: int (optional) 1338 """ 1339 1340 log.info("Loading...") 1341 1342 # change input file 1343 if input_file: 1344 self.set_input(input_file) 1345 self.set_header() 1346 1347 # drop variants table 1348 if drop_variants_table: 1349 self.drop_variants_table() 1350 1351 # get table variants 1352 table_variants = self.get_table_variants() 1353 1354 # Access 1355 access = self.get_config().get("access", None) 1356 log.debug(f"access: {access}") 1357 1358 # Input format and compress 1359 input_format = self.get_input_format() 1360 input_compressed = self.get_input_compressed() 1361 log.debug(f"input_format: {input_format}") 1362 log.debug(f"input_compressed: {input_compressed}") 1363 1364 # input_compressed_format 1365 if input_compressed: 1366 input_compressed_format = "gzip" 1367 else: 1368 input_compressed_format = "none" 1369 log.debug(f"input_compressed_format: {input_compressed_format}") 1370 1371 # Connexion 
format 1372 connexion_format = self.get_connexion_format() 1373 1374 # Sample size 1375 if not sample_size: 1376 sample_size = -1 1377 log.debug(f"sample_size: {sample_size}") 1378 1379 # Load data 1380 log.debug(f"Load Data from {input_format}") 1381 1382 # DuckDB connexion 1383 if connexion_format in ["duckdb"]: 1384 1385 # Database already exists 1386 if self.input_format in ["db", "duckdb"]: 1387 1388 if connexion_format in ["duckdb"]: 1389 log.debug(f"Input file format '{self.input_format}' duckDB") 1390 else: 1391 log.error( 1392 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1393 ) 1394 raise ValueError( 1395 f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'" 1396 ) 1397 1398 # Load from existing database format 1399 else: 1400 1401 try: 1402 # Create Table or View 1403 database = Database(database=self.input) 1404 sql_from = database.get_sql_from(sample_size=sample_size) 1405 1406 if access in ["RO"]: 1407 sql_load = ( 1408 f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}" 1409 ) 1410 else: 1411 sql_load = ( 1412 f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}" 1413 ) 1414 self.conn.execute(sql_load) 1415 1416 except: 1417 # Format not available 1418 log.error(f"Input file format '{self.input_format}' not available") 1419 raise ValueError( 1420 f"Input file format '{self.input_format}' not available" 1421 ) 1422 1423 # SQLite connexion 1424 elif connexion_format in ["sqlite"] and input_format in [ 1425 "vcf", 1426 "tsv", 1427 "csv", 1428 "psv", 1429 ]: 1430 1431 # Main structure 1432 structure = { 1433 "#CHROM": "VARCHAR", 1434 "POS": "INTEGER", 1435 "ID": "VARCHAR", 1436 "REF": "VARCHAR", 1437 "ALT": "VARCHAR", 1438 "QUAL": "VARCHAR", 1439 "FILTER": "VARCHAR", 1440 "INFO": "VARCHAR", 1441 } 1442 1443 # Strcuture with samples 1444 structure_complete = structure 1445 if self.get_header_sample_list(): 1446 structure["FORMAT"] = "VARCHAR" 
1447 for sample in self.get_header_sample_list(): 1448 structure_complete[sample] = "VARCHAR" 1449 1450 # Columns list for create and insert 1451 sql_create_table_columns = [] 1452 sql_create_table_columns_list = [] 1453 for column in structure_complete: 1454 column_type = structure_complete[column] 1455 sql_create_table_columns.append( 1456 f'"{column}" {column_type} default NULL' 1457 ) 1458 sql_create_table_columns_list.append(f'"{column}"') 1459 1460 # Create database 1461 log.debug(f"Create Table {table_variants}") 1462 sql_create_table_columns_sql = ", ".join(sql_create_table_columns) 1463 sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list) 1464 sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})" 1465 self.conn.execute(sql_create_table) 1466 1467 # chunksize define length of file chunk load file 1468 chunksize = 100000 1469 1470 # delimiter 1471 delimiter = file_format_delimiters.get(input_format, "\t") 1472 1473 # Load the input file 1474 with open(self.input, "rt") as input_file: 1475 1476 # Use the appropriate file handler based on the input format 1477 if input_compressed: 1478 input_file = bgzf.open(self.input, "rt") 1479 if input_format in ["vcf"]: 1480 header_len = self.get_header_length() 1481 else: 1482 header_len = 0 1483 1484 # Insert the file contents into a table 1485 self.insert_file_to_table( 1486 input_file, 1487 columns=sql_create_table_columns_list_sql, 1488 header_len=header_len, 1489 sep=delimiter, 1490 chunksize=chunksize, 1491 ) 1492 1493 else: 1494 log.error( 1495 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1496 ) 1497 raise ValueError( 1498 f"Connexion format '{connexion_format}' not available with format '{input_format}'" 1499 ) 1500 1501 # Explode INFOS fields into table fields 1502 if self.get_explode_infos(): 1503 self.explode_infos( 1504 prefix=self.get_explode_infos_prefix(), 1505 fields=self.get_explode_infos_fields(), 
1506 force=True, 1507 ) 1508 1509 # Create index after insertion 1510 self.create_indexes()
The load_data function reads a VCF file and inserts it into a table, with options to drop the
table before loading the data and specify a sample size.
Parameters
- input_file: The path to the input file. This is the VCF file that will be loaded into the table
- drop_variants_table: The
`drop_variants_table` parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to `True`, the variants table will be dropped. If set to `False` (default), the variants table will not be dropped, defaults to False. - sample_size: The
`sample_size` parameter determines the number of rows to be sampled from the input file. If it is set to `None`, the default value of 20480 will be used, defaults to 20480.
1512 def get_explode_infos(self) -> bool: 1513 """ 1514 The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting 1515 to False if it is not set. 1516 :return: The method is returning the value of the "explode_infos" parameter, which is a boolean 1517 value. If the parameter is not present, it will return False. 1518 """ 1519 1520 return self.get_param().get("explode", {}).get("explode_infos", False)
The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting
to False if it is not set.
Returns
The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.
1522 def get_explode_infos_fields( 1523 self, 1524 explode_infos_fields: str = None, 1525 remove_fields_not_in_header: bool = False, 1526 ) -> list: 1527 """ 1528 The `get_explode_infos_fields` function returns a list of exploded information fields based on 1529 the input parameter `explode_infos_fields`. 1530 1531 :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the 1532 fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a 1533 comma-separated list of field names to explode 1534 :type explode_infos_fields: str 1535 :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean 1536 flag that determines whether to remove fields that are not present in the header. If it is set 1537 to `True`, any field that is not in the header will be excluded from the list of exploded 1538 information fields. If it is set to `, defaults to False 1539 :type remove_fields_not_in_header: bool (optional) 1540 :return: The function `get_explode_infos_fields` returns a list of exploded information fields. 1541 If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty 1542 list. If the parameter is provided and its value is "ALL", it also returns an empty list. 1543 Otherwise, it returns a list of exploded information fields after removing any spaces and 1544 splitting the string by commas. 
1545 """ 1546 1547 # If no fields, get it in param 1548 if not explode_infos_fields: 1549 explode_infos_fields = ( 1550 self.get_param().get("explode", {}).get("explode_infos_fields", None) 1551 ) 1552 1553 # If no fields, defined as all fields in header using keyword 1554 if not explode_infos_fields: 1555 explode_infos_fields = "*" 1556 1557 # If fields list not empty 1558 if explode_infos_fields: 1559 1560 # Input fields list 1561 if isinstance(explode_infos_fields, str): 1562 fields_input = explode_infos_fields.split(",") 1563 elif isinstance(explode_infos_fields, list): 1564 fields_input = explode_infos_fields 1565 else: 1566 fields_input = [] 1567 1568 # Fields list without * keyword 1569 fields_without_all = fields_input.copy() 1570 if "*".casefold() in (item.casefold() for item in fields_without_all): 1571 fields_without_all.remove("*") 1572 1573 # Fields in header 1574 fields_in_header = sorted(list(set(self.get_header().infos))) 1575 1576 # Construct list of fields 1577 fields_output = [] 1578 for field in fields_input: 1579 1580 # Strip field 1581 field = field.strip() 1582 1583 # format keyword * in regex 1584 if field.upper() in ["*"]: 1585 field = ".*" 1586 1587 # Find all fields with pattern 1588 r = re.compile(field) 1589 fields_search = sorted(list(filter(r.match, fields_in_header))) 1590 1591 # Remove fields input from search 1592 if field in fields_search: 1593 fields_search = [field] 1594 elif fields_search != [field]: 1595 fields_search = sorted( 1596 list(set(fields_search).difference(fields_input)) 1597 ) 1598 1599 # If field is not in header (avoid not well formatted header) 1600 if not fields_search and not remove_fields_not_in_header: 1601 fields_search = [field] 1602 1603 # Add found fields 1604 for new_field in fields_search: 1605 # Add field, if not already exists, and if it is in header (if asked) 1606 if ( 1607 new_field not in fields_output 1608 and ( 1609 not remove_fields_not_in_header 1610 or new_field in fields_in_header 1611 ) 
1612 and new_field not in [".*"] 1613 ): 1614 fields_output.append(new_field) 1615 1616 return fields_output 1617 1618 else: 1619 1620 return []
The get_explode_infos_fields function returns a list of exploded information fields based on
the input parameter explode_infos_fields.
Parameters
- explode_infos_fields: The
`explode_infos_fields` parameter is a string that specifies the fields to be exploded. It can be set to "*" to explode all fields, or it can be a comma-separated list of field names to explode - remove_fields_not_in_header: The
`remove_fields_not_in_header` parameter is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to `True`, any field that is not in the header will be excluded from the list of exploded information fields; it defaults to `False`.
Returns
The function
`get_explode_infos_fields` returns a list of exploded information fields. If the `explode_infos_fields` parameter is not provided, the fields are read from the parameters, defaulting to "*" (all header fields). Otherwise, it returns a list of exploded information fields after removing any spaces and splitting the string by commas.
1622 def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str: 1623 """ 1624 The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or 1625 the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is 1626 not provided. 1627 1628 :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a 1629 prefix to be used for exploding or expanding information 1630 :type explode_infos_prefix: str 1631 :return: the value of the variable `explode_infos_prefix`. 1632 """ 1633 1634 if not explode_infos_prefix: 1635 explode_infos_prefix = ( 1636 self.get_param().get("explode", {}).get("explode_infos_prefix", "") 1637 ) 1638 1639 return explode_infos_prefix
The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or
the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is
not provided.
Parameters
- explode_infos_prefix: The parameter
`explode_infos_prefix` is a string that specifies a prefix to be used for exploding or expanding information
Returns
the value of the variable
explode_infos_prefix.
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        Add a column to a SQLite or DuckDB table, optionally dropping and
        recreating it when it already exists.

        :param table_name: name of the table to alter
        :param column_name: name of the column to add (case-insensitive
            existence check)
        :param column_type: SQL type of the new column, e.g. "INTEGER",
            "TEXT", "VARCHAR"
        :param default_value: optional DEFAULT value for the new column;
            interpolated verbatim into the ALTER statement
        :param drop: if True and the column already exists, drop it first and
            recreate it; if False, an existing column is left untouched and
            None is returned, defaults to False
        :return: a dict describing the added column ("table_name",
            "column_name", "column_type", "default_value"), or None when the
            column already existed (whether or not it was recreated)
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table
        # (LIMIT 0 fetches only column metadata, no rows)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # NOTE(review): after a drop+recreate, `added` stays False, so the
        # method returns None even though the column was re-created; callers
        # (e.g. explode_infos) compensate with `if added_column or force` —
        # confirm this is intended.
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column
The add_column function adds a column to a SQLite or DuckDB table with a default value if it
doesn't already exist.
Parameters
- table_name: The name of the table to which you want to add a column
- column_name: The parameter "column_name" is the name of the column that you want to add to the table
- column_type: The
`column_type` parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc. - default_value: The
`default_value` parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column. - drop: The
`drop` parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If `drop` is set to `True`, the function will drop the existing column before adding the new column; it defaults to `False`.
Returns
a dictionary describing the added column, or `None` if the column already existed or was not added.
1713 def drop_column( 1714 self, column: dict = None, table_name: str = None, column_name: str = None 1715 ) -> bool: 1716 """ 1717 The `drop_column` function drops a specified column from a given table in a database and returns 1718 True if the column was successfully dropped, and False if the column does not exist in the 1719 table. 1720 1721 :param column: The `column` parameter is a dictionary that contains information about the column 1722 you want to drop. It has two keys: 1723 :type column: dict 1724 :param table_name: The `table_name` parameter is the name of the table from which you want to 1725 drop a column 1726 :type table_name: str 1727 :param column_name: The `column_name` parameter is the name of the column that you want to drop 1728 from the table 1729 :type column_name: str 1730 :return: a boolean value. It returns True if the column was successfully dropped from the table, 1731 and False if the column does not exist in the table. 1732 """ 1733 1734 # Find column infos 1735 if column: 1736 if isinstance(column, dict): 1737 table_name = column.get("table_name", None) 1738 column_name = column.get("column_name", None) 1739 elif isinstance(column, str): 1740 table_name = self.get_table_variants() 1741 column_name = column 1742 else: 1743 table_name = None 1744 column_name = None 1745 1746 if not table_name and not column_name: 1747 return False 1748 1749 # Removed 1750 removed = False 1751 1752 # Check if the column already exists in the table 1753 query = f""" SELECT * FROM {table_name} LIMIT 0 """ 1754 columns = self.get_query_to_df(query).columns.tolist() 1755 if column_name in columns: 1756 log.debug(f"The {column_name} column exists in the {table_name} table") 1757 else: 1758 log.debug(f"The {column_name} column NOT exists in the {table_name} table") 1759 return False 1760 1761 # Add column in table # ALTER TABLE integers DROP k 1762 add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """ 1763 
self.execute_query(add_column_query) 1764 removed = True 1765 log.debug( 1766 f"The {column_name} column was successfully dropped to the {table_name} table" 1767 ) 1768 1769 return removed
The drop_column function drops a specified column from a given table in a database and returns
True if the column was successfully dropped, and False if the column does not exist in the
table.
Parameters
- column: The
columnparameter is a dictionary that contains information about the column you want to drop. It has two keys: - table_name: The
table_nameparameter is the name of the table from which you want to drop a column - column_name: The
column_nameparameter is the name of the column that you want to drop from the table
Returns
a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.
1771 def explode_infos( 1772 self, 1773 prefix: str = None, 1774 create_index: bool = False, 1775 fields: list = None, 1776 force: bool = False, 1777 proccess_all_fields_together: bool = False, 1778 table: str = None, 1779 ) -> list: 1780 """ 1781 The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into 1782 individual columns, returning a list of added columns. 1783 1784 :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO 1785 fields. If the `prefix` is not provided or is set to `None`, the function will use the value of 1786 `self.get_explode_infos_prefix()` as the prefix 1787 :type prefix: str 1788 :param create_index: The `create_index` parameter is a boolean flag that specifies whether to 1789 create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to 1790 `False`, indexes will not be created. The default value is `False`, defaults to False 1791 :type create_index: bool (optional) 1792 :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields 1793 that you want to explode into individual columns. If this parameter is not provided, all INFO 1794 fields will be exploded. You can specify the INFO fields you want to explode by passing them as 1795 a list to the ` 1796 :type fields: list 1797 :param force: The `force` parameter in the `explode_infos` function is a boolean flag that 1798 determines whether to drop and recreate a column if it already exists in the table. If `force` 1799 is set to `True`, the column will be dropped and recreated. If `force` is set to `False, 1800 defaults to False 1801 :type force: bool (optional) 1802 :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean 1803 flag that determines whether to process all the INFO fields together or individually. If set to 1804 `True`, all the INFO fields will be processed together. 
If set to `False`, each INFO field will 1805 be processed individually. The default value is, defaults to False 1806 :type proccess_all_fields_together: bool (optional) 1807 :param table: The `table` parameter in the `explode_infos` function is used to specify the name 1808 of the table where the exploded INFO fields will be added as individual columns. If you provide 1809 a value for the `table` parameter, the function will use that table name. If the `table` 1810 parameter is 1811 :type table: str 1812 :return: The `explode_infos` function returns a list of added columns. 1813 """ 1814 1815 # drop indexes 1816 self.drop_indexes() 1817 1818 # connexion format 1819 connexion_format = self.get_connexion_format() 1820 1821 # Access 1822 access = self.get_config().get("access", None) 1823 1824 # Added columns 1825 added_columns = [] 1826 1827 if access not in ["RO"]: 1828 1829 # prefix 1830 if prefix in [None, True] or not isinstance(prefix, str): 1831 if self.get_explode_infos_prefix() not in [None, True]: 1832 prefix = self.get_explode_infos_prefix() 1833 else: 1834 prefix = "INFO/" 1835 1836 # table variants 1837 if table is not None: 1838 table_variants = table 1839 else: 1840 table_variants = self.get_table_variants(clause="select") 1841 1842 # extra infos 1843 try: 1844 extra_infos = self.get_extra_infos() 1845 except: 1846 extra_infos = [] 1847 1848 # Header infos 1849 header_infos = self.get_header().infos 1850 1851 log.debug( 1852 f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields" 1853 ) 1854 1855 sql_info_alter_table_array = [] 1856 1857 # Info fields to check 1858 fields_list = list(header_infos) 1859 if fields: 1860 fields_list += fields 1861 fields_list = set(fields_list) 1862 1863 # If no fields 1864 if not fields: 1865 fields = [] 1866 1867 # Translate fields if patterns 1868 fields = self.get_explode_infos_fields(explode_infos_fields=fields) 1869 1870 for info in fields: 1871 1872 info_id_sql = prefix + info 1873 1874 if ( 1875 info 
in fields_list 1876 or prefix + info in fields_list 1877 or info in extra_infos 1878 ): 1879 1880 log.debug(f"Explode INFO fields - ADD '{info}' annotations fields") 1881 1882 if info in header_infos: 1883 info_type = header_infos[info].type 1884 info_num = header_infos[info].num 1885 else: 1886 info_type = "String" 1887 info_num = 0 1888 1889 type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR") 1890 if info_num != 1: 1891 type_sql = "VARCHAR" 1892 1893 # Add field 1894 added_column = self.add_column( 1895 table_name=table_variants, 1896 column_name=info_id_sql, 1897 column_type=type_sql, 1898 default_value="null", 1899 drop=force, 1900 ) 1901 1902 if added_column: 1903 added_columns.append(added_column) 1904 1905 if added_column or force: 1906 1907 # add field to index 1908 self.index_additionnal_fields.append(info_id_sql) 1909 1910 # Update field array 1911 if connexion_format in ["duckdb"]: 1912 update_info_field = f""" 1913 "{info_id_sql}" = 1914 CASE 1915 WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL 1916 ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) 1917 END 1918 """ 1919 elif connexion_format in ["sqlite"]: 1920 update_info_field = f""" 1921 "{info_id_sql}" = 1922 CASE 1923 WHEN instr(INFO, '{info}=') = 0 THEN NULL 1924 WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1) 1925 ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1) 1926 END 1927 """ 1928 1929 sql_info_alter_table_array.append(update_info_field) 1930 1931 if sql_info_alter_table_array: 1932 1933 # By chromosomes 1934 try: 1935 chromosomes_list = list( 1936 self.get_query_to_df( 1937 f""" 
SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """ 1938 )["#CHROM"] 1939 ) 1940 except: 1941 chromosomes_list = [None] 1942 1943 for chrom in chromosomes_list: 1944 log.debug(f"Explode INFO fields - Chromosome {chrom}...") 1945 1946 # Where clause 1947 where_clause = "" 1948 if chrom and len(chromosomes_list) > 1: 1949 where_clause = f""" WHERE "#CHROM" = '{chrom}' """ 1950 1951 # Update table 1952 if proccess_all_fields_together: 1953 sql_info_alter_table_array_join = ", ".join( 1954 sql_info_alter_table_array 1955 ) 1956 if sql_info_alter_table_array_join: 1957 sql_info_alter_table = f""" 1958 UPDATE {table_variants} 1959 SET {sql_info_alter_table_array_join} 1960 {where_clause} 1961 """ 1962 log.debug( 1963 f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..." 1964 ) 1965 # log.debug(sql_info_alter_table) 1966 self.conn.execute(sql_info_alter_table) 1967 else: 1968 sql_info_alter_num = 0 1969 for sql_info_alter in sql_info_alter_table_array: 1970 sql_info_alter_num += 1 1971 sql_info_alter_table = f""" 1972 UPDATE {table_variants} 1973 SET {sql_info_alter} 1974 {where_clause} 1975 """ 1976 log.debug( 1977 f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..." 1978 ) 1979 # log.debug(sql_info_alter_table) 1980 self.conn.execute(sql_info_alter_table) 1981 1982 # create indexes 1983 if create_index: 1984 self.create_indexes() 1985 1986 return added_columns
The explode_infos function in Python takes a VCF file and explodes the INFO fields into
individual columns, returning a list of added columns.
Parameters
- prefix: The
prefixparameter is a string that is used as a prefix for the exploded INFO fields. If theprefixis not provided or is set toNone, the function will use the value ofself.get_explode_infos_prefix()as the prefix - create_index: The
create_indexparameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set toTrue, indexes will be created; if set toFalse, indexes will not be created. The default value isFalse, defaults to False - fields: The
fieldsparameter in theexplode_infosfunction is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields you want to explode by passing them as a list to the ` - force: The
forceparameter in theexplode_infosfunction is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. Ifforceis set toTrue, the column will be dropped and recreated. Ifforceis set to `False, defaults to False - proccess_all_fields_together: The
proccess_all_fields_togetherparameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set toTrue, all the INFO fields will be processed together. If set toFalse, each INFO field will be processed individually. The default value is, defaults to False - table: The
tableparameter in theexplode_infosfunction is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for thetableparameter, the function will use that table name. If thetableparameter is
Returns
The
explode_infosfunction returns a list of added columns.
1988 def create_indexes(self) -> None: 1989 """ 1990 Create indexes on the table after insertion 1991 """ 1992 1993 # Access 1994 access = self.get_config().get("access", None) 1995 1996 # get table variants 1997 table_variants = self.get_table_variants("FROM") 1998 1999 if self.get_indexing() and access not in ["RO"]: 2000 # Create index 2001 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")' 2002 self.conn.execute(sql_create_table_index) 2003 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")' 2004 self.conn.execute(sql_create_table_index) 2005 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")' 2006 self.conn.execute(sql_create_table_index) 2007 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")' 2008 self.conn.execute(sql_create_table_index) 2009 sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")' 2010 self.conn.execute(sql_create_table_index) 2011 for field in self.index_additionnal_fields: 2012 sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """ 2013 self.conn.execute(sql_create_table_index)
Create indexes on the table after insertion
2015 def drop_indexes(self) -> None: 2016 """ 2017 Create indexes on the table after insertion 2018 """ 2019 2020 # Access 2021 access = self.get_config().get("access", None) 2022 2023 # get table variants 2024 table_variants = self.get_table_variants("FROM") 2025 2026 # Get database format 2027 connexion_format = self.get_connexion_format() 2028 2029 if access not in ["RO"]: 2030 if connexion_format in ["duckdb"]: 2031 sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'" 2032 elif connexion_format in ["sqlite"]: 2033 sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';" 2034 2035 list_indexes = self.conn.execute(sql_list_indexes) 2036 index_names = [row[0] for row in list_indexes.fetchall()] 2037 for index in index_names: 2038 sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """ 2039 self.conn.execute(sql_drop_table_index)
Drop all indexes of the variants table.
2041 def read_vcf_header(self, f) -> list: 2042 """ 2043 It reads the header of a VCF file and returns a list of the header lines 2044 2045 :param f: the file object 2046 :return: The header lines of the VCF file. 2047 """ 2048 2049 header_list = [] 2050 for line in f: 2051 header_list.append(line) 2052 if line.startswith("#CHROM"): 2053 break 2054 return header_list
It reads the header of a VCF file and returns a list of the header lines
Parameters
- f: the file object
Returns
The header lines of the VCF file.
2056 def read_vcf_header_file(self, file: str = None) -> list: 2057 """ 2058 The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and 2059 uncompressed files. 2060 2061 :param file: The `file` parameter is a string that represents the path to the VCF header file 2062 that you want to read. It is an optional parameter, so if you don't provide a value, it will 2063 default to `None` 2064 :type file: str 2065 :return: The function `read_vcf_header_file` returns a list. 2066 """ 2067 2068 if self.get_input_compressed(input_file=file): 2069 with bgzf.open(file, "rt") as f: 2070 return self.read_vcf_header(f=f) 2071 else: 2072 with open(file, "rt") as f: 2073 return self.read_vcf_header(f=f)
The read_vcf_header_file function reads the header of a VCF file, handling both compressed and
uncompressed files.
Parameters
- file: The
`file` parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to `None`
Returns
The function
read_vcf_header_filereturns a list.
2075 def execute_query(self, query: str): 2076 """ 2077 It takes a query as an argument, executes it, and returns the results 2078 2079 :param query: The query to be executed 2080 :return: The result of the query is being returned. 2081 """ 2082 if query: 2083 return self.conn.execute(query) # .fetchall() 2084 else: 2085 return None
It takes a query as an argument, executes it, and returns the results
Parameters
- query: The query to be executed
Returns
The result of the query is being returned.
2087 def export_output( 2088 self, 2089 output_file: str | None = None, 2090 output_header: str | None = None, 2091 export_header: bool = True, 2092 query: str | None = None, 2093 parquet_partitions: list | None = None, 2094 chunk_size: int | None = None, 2095 threads: int | None = None, 2096 sort: bool = False, 2097 index: bool = False, 2098 order_by: str | None = None, 2099 ) -> bool: 2100 """ 2101 The `export_output` function exports data from a VCF file to a specified output file in various 2102 formats, including VCF, CSV, TSV, PSV, and Parquet. 2103 2104 :param output_file: The `output_file` parameter is a string that specifies the name of the 2105 output file to be generated by the function. This is where the exported data will be saved 2106 :type output_file: str 2107 :param output_header: The `output_header` parameter is a string that specifies the name of the 2108 file where the header of the VCF file will be exported. If this parameter is not provided, the 2109 header will be exported to a file with the same name as the `output_file` parameter, but with 2110 the extension " 2111 :type output_header: str 2112 :param export_header: The `export_header` parameter is a boolean flag that determines whether 2113 the header of a VCF file should be exported to a separate file or not. If `export_header` is 2114 True, the header will be exported to a file. If `export_header` is False, the header will not 2115 be, defaults to True, if output format is not VCF 2116 :type export_header: bool (optional) 2117 :param query: The `query` parameter is an optional SQL query that can be used to filter and 2118 select specific data from the VCF file before exporting it. If provided, only the data that 2119 matches the query will be exported 2120 :type query: str 2121 :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the 2122 columns to be used for partitioning the Parquet file during export. 
Partitioning is a way to 2123 organize data in a hierarchical directory structure based on the values of one or more columns. 2124 This can improve query performance when working with large datasets 2125 :type parquet_partitions: list 2126 :param chunk_size: The `chunk_size` parameter specifies the number of 2127 records in batch when exporting data in Parquet format. This parameter is used for 2128 partitioning the Parquet file into multiple files. 2129 :type chunk_size: int 2130 :param threads: The `threads` parameter is an optional parameter that specifies the number of 2131 threads to be used during the export process. It determines the level of parallelism and can 2132 improve the performance of the export operation. If not provided, the function will use the 2133 default number of threads 2134 :type threads: int 2135 :param sort: The `sort` parameter is a boolean flag that determines whether the output file 2136 should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the 2137 genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to 2138 False 2139 :type sort: bool (optional) 2140 :param index: The `index` parameter is a boolean flag that determines whether an index should be 2141 created on the output file. If `index` is True, an index will be created. If `index` is False, 2142 no index will be created. The default value is False, defaults to False 2143 :type index: bool (optional) 2144 :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for 2145 sorting the output file. This parameter is only applicable when exporting data in VCF format 2146 :type order_by: str 2147 :return: a boolean value. It checks if the output file exists and returns True if it does, or 2148 None if it doesn't. 
2149 """ 2150 2151 # Log 2152 log.info("Exporting...") 2153 2154 # Full path 2155 output_file = full_path(output_file) 2156 output_header = full_path(output_header) 2157 2158 # Config 2159 config = self.get_config() 2160 2161 # Param 2162 param = self.get_param() 2163 2164 # Tmp files to remove 2165 tmp_to_remove = [] 2166 2167 # If no output, get it 2168 if not output_file: 2169 output_file = self.get_output() 2170 2171 # If not threads 2172 if not threads: 2173 threads = self.get_threads() 2174 2175 # Auto header name with extension 2176 if export_header or output_header: 2177 if not output_header: 2178 output_header = f"{output_file}.hdr" 2179 # Export header 2180 self.export_header(output_file=output_file) 2181 2182 # Switch off export header if VCF output 2183 output_file_type = get_file_format(output_file) 2184 if output_file_type in ["vcf"]: 2185 export_header = False 2186 tmp_to_remove.append(output_header) 2187 2188 # Chunk size 2189 if not chunk_size: 2190 chunk_size = config.get("chunk_size", None) 2191 2192 # Parquet partition 2193 if not parquet_partitions: 2194 parquet_partitions = param.get("export", {}).get("parquet_partitions", None) 2195 if parquet_partitions and isinstance(parquet_partitions, str): 2196 parquet_partitions = parquet_partitions.split(",") 2197 2198 # Order by 2199 if not order_by: 2200 order_by = param.get("export", {}).get("order_by", "") 2201 2202 # Header in output 2203 header_in_output = param.get("export", {}).get("include_header", False) 2204 2205 # Database 2206 database_source = self.get_connexion() 2207 2208 # Connexion format 2209 connexion_format = self.get_connexion_format() 2210 2211 # Explode infos 2212 if self.get_explode_infos(): 2213 self.explode_infos( 2214 prefix=self.get_explode_infos_prefix(), 2215 fields=self.get_explode_infos_fields(), 2216 force=False, 2217 ) 2218 2219 # if connexion_format in ["sqlite"] or query: 2220 if connexion_format in ["sqlite"]: 2221 2222 # Export in Parquet 2223 random_tmp = 
"".join( 2224 random.choice(string.ascii_lowercase) for i in range(10) 2225 ) 2226 database_source = f"""{output_file}.{random_tmp}.database_export.parquet""" 2227 tmp_to_remove.append(database_source) 2228 2229 # Table Variants 2230 table_variants = self.get_table_variants() 2231 2232 # Create export query 2233 sql_query_export_subquery = f""" 2234 SELECT * FROM {table_variants} 2235 """ 2236 2237 # Write source file 2238 fp.write(database_source, self.get_query_to_df(sql_query_export_subquery)) 2239 2240 # Create database 2241 database = Database( 2242 database=database_source, 2243 table="variants", 2244 header_file=output_header, 2245 conn_config=self.get_connexion_config(), 2246 ) 2247 2248 # Existing colomns header 2249 existing_columns_header = database.get_header_columns_from_database(query=query) 2250 2251 # Sample list 2252 if output_file_type in ["vcf"]: 2253 get_samples = self.get_samples() 2254 get_samples_check = self.get_samples_check() 2255 samples_force = get_samples is not None 2256 sample_list = self.get_header_sample_list( 2257 check=get_samples_check, 2258 samples=get_samples, 2259 samples_force=samples_force, 2260 ) 2261 else: 2262 sample_list = None 2263 2264 # Export file 2265 database.export( 2266 output_database=output_file, 2267 output_header=output_header, 2268 existing_columns_header=existing_columns_header, 2269 parquet_partitions=parquet_partitions, 2270 chunk_size=chunk_size, 2271 threads=threads, 2272 sort=sort, 2273 index=index, 2274 header_in_output=header_in_output, 2275 order_by=order_by, 2276 query=query, 2277 export_header=export_header, 2278 sample_list=sample_list, 2279 ) 2280 2281 # Remove 2282 remove_if_exists(tmp_to_remove) 2283 2284 return (os.path.exists(output_file) or None) and ( 2285 os.path.exists(output_file) or None 2286 )
The export_output function exports data from a VCF file to a specified output file in various
formats, including VCF, CSV, TSV, PSV, and Parquet.
Parameters
- `output_file`: a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved.
- `output_header`: a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the `output_file` parameter, but with the extension ".hdr".
- `export_header`: a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If `export_header` is True, the header will be exported to a file; if False, it will not be. Defaults to True if the output format is not VCF.
- `query`: an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported.
- `parquet_partitions`: a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets.
- `chunk_size`: the number of records per batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
- `threads`: an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads.
- `sort`: a boolean flag that determines whether the output file should be sorted or not. If `sort` is set to True, the output file will be sorted based on the genomic coordinates of the variants. Defaults to False.
- `index`: a boolean flag that determines whether an index should be created on the output file. If True, an index will be created; if False, no index will be created. Defaults to False.
- `order_by`: a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format.
Returns
a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.
2288 def get_extra_infos(self, table: str = None) -> list: 2289 """ 2290 The `get_extra_infos` function returns a list of columns that are in a specified table but not 2291 in the header. 2292 2293 :param table: The `table` parameter in the `get_extra_infos` function is used to specify the 2294 name of the table from which you want to retrieve the extra columns that are not present in the 2295 header. If the `table` parameter is not provided when calling the function, it will default to 2296 using the variants 2297 :type table: str 2298 :return: A list of columns that are in the specified table but not in the header of the table. 2299 """ 2300 2301 header_columns = [] 2302 2303 if not table: 2304 table = self.get_table_variants(clause="from") 2305 header_columns = self.get_header_columns() 2306 2307 # Check all columns in the database 2308 query = f""" SELECT * FROM {table} LIMIT 1 """ 2309 log.debug(f"query {query}") 2310 table_columns = self.get_query_to_df(query).columns.tolist() 2311 extra_columns = [] 2312 2313 # Construct extra infos (not in header) 2314 for column in table_columns: 2315 if column not in header_columns: 2316 extra_columns.append(column) 2317 2318 return extra_columns
The get_extra_infos function returns a list of columns that are in a specified table but not
in the header.
Parameters
- `table`: used to specify the name of the table from which to retrieve the extra columns that are not present in the header. If the `table` parameter is not provided when calling the function, it defaults to the variants table.
Returns
A list of columns that are in the specified table but not in the header of the table.
2320 def get_extra_infos_sql(self, table: str = None) -> str: 2321 """ 2322 It returns a string of the extra infos, separated by commas, and each extra info is surrounded 2323 by double quotes 2324 2325 :param table: The name of the table to get the extra infos from. If None, the default table is 2326 used 2327 :type table: str 2328 :return: A string of the extra infos 2329 """ 2330 2331 return ", ".join( 2332 ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)] 2333 )
It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes
Parameters
- table: The name of the table to get the extra infos from. If None, the default table is used
Returns
A string of the extra infos
2335 def export_header( 2336 self, 2337 header_name: str = None, 2338 output_file: str = None, 2339 output_file_ext: str = ".hdr", 2340 clean_header: bool = True, 2341 remove_chrom_line: bool = False, 2342 ) -> str: 2343 """ 2344 The `export_header` function takes a VCF file, extracts the header, modifies it according to 2345 specified options, and writes it to a new file. 2346 2347 :param header_name: The `header_name` parameter is the name of the header file to be created. If 2348 this parameter is not specified, the header will be written to the output file 2349 :type header_name: str 2350 :param output_file: The `output_file` parameter in the `export_header` function is used to 2351 specify the name of the output file where the header will be written. If this parameter is not 2352 provided, the header will be written to a temporary file 2353 :type output_file: str 2354 :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a 2355 string that represents the extension of the output header file. By default, it is set to ".hdr" 2356 if not specified by the user. This extension will be appended to the `output_file` name to 2357 create the final, defaults to .hdr 2358 :type output_file_ext: str (optional) 2359 :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean 2360 flag that determines whether the header should be cleaned or not. When `clean_header` is set to 2361 `True`, the function will clean the header by modifying certain lines based on a specific 2362 pattern. If `clean_header`, defaults to True 2363 :type clean_header: bool (optional) 2364 :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a 2365 boolean flag that determines whether the #CHROM line should be removed from the header before 2366 writing it to the output file. 
If set to `True`, the #CHROM line will be removed; if set to `, 2367 defaults to False 2368 :type remove_chrom_line: bool (optional) 2369 :return: The function `export_header` returns the name of the temporary header file that is 2370 created. 2371 """ 2372 2373 if not header_name and not output_file: 2374 output_file = self.get_output() 2375 2376 if self.get_header(): 2377 2378 # Get header object 2379 header_obj = self.get_header() 2380 2381 # Create database 2382 db_for_header = Database(database=self.get_input()) 2383 2384 # Get real columns in the file 2385 db_header_columns = db_for_header.get_columns() 2386 2387 with tempfile.TemporaryDirectory() as tmpdir: 2388 2389 # Write header file 2390 header_file_tmp = os.path.join(tmpdir, "header") 2391 f = open(header_file_tmp, "w") 2392 vcf.Writer(f, header_obj) 2393 f.close() 2394 2395 # Replace #CHROM line with rel columns 2396 header_list = db_for_header.read_header_file( 2397 header_file=header_file_tmp 2398 ) 2399 header_list[-1] = "\t".join(db_header_columns) 2400 2401 # Remove CHROM line 2402 if remove_chrom_line: 2403 header_list.pop() 2404 2405 # Clean header 2406 if clean_header: 2407 header_list_clean = [] 2408 for head in header_list: 2409 # Clean head for malformed header 2410 head_clean = head 2411 head_clean = re.subn( 2412 "##FORMAT=<ID=(.*),Number=(.*),Type=Flag", 2413 r"##FORMAT=<ID=\1,Number=\2,Type=String", 2414 head_clean, 2415 2, 2416 )[0] 2417 # Write header 2418 header_list_clean.append(head_clean) 2419 header_list = header_list_clean 2420 2421 tmp_header_name = output_file + output_file_ext 2422 2423 f = open(tmp_header_name, "w") 2424 for line in header_list: 2425 f.write(line) 2426 f.close() 2427 2428 return tmp_header_name
The export_header function takes a VCF file, extracts the header, modifies it according to
specified options, and writes it to a new file.
Parameters
- `header_name`: the name of the header file to be created. If this parameter is not specified, the header will be written to the output file.
- `output_file`: used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file.
- `output_file_ext`: a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the `output_file` name to create the final name.
- `clean_header`: a boolean flag that determines whether the header should be cleaned or not. When `clean_header` is set to True, the function will clean the header by modifying certain lines based on a specific pattern. Defaults to True.
- `remove_chrom_line`: a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it will be kept. Defaults to False.
Returns
The function `export_header` returns the name of the temporary header file that is created.
2430 def export_variant_vcf( 2431 self, 2432 vcf_file, 2433 remove_info: bool = False, 2434 add_samples: bool = True, 2435 list_samples: list = [], 2436 where_clause: str = "", 2437 index: bool = False, 2438 threads: int | None = None, 2439 ) -> bool | None: 2440 """ 2441 The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to 2442 remove INFO field, add samples, and control compression and indexing. 2443 2444 :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be 2445 written to. It is the output file that will contain the filtered VCF data based on the specified 2446 parameters 2447 :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a 2448 boolean flag that determines whether to remove the INFO field from the output VCF file. If set 2449 to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included 2450 in, defaults to False 2451 :type remove_info: bool (optional) 2452 :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether 2453 the samples should be added to the VCF file or not. If set to True, the samples will be added. 2454 If set to False, the samples will be removed. The default value is True, defaults to True 2455 :type add_samples: bool (optional) 2456 :param list_samples: The `list_samples` parameter is a list of samples that you want to include 2457 in the output VCF file. By default, all samples will be included. If you provide a list of 2458 samples, only those samples will be included in the output file 2459 :type list_samples: list 2460 :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that 2461 determines whether or not to create an index for the output VCF file. If `index` is set to 2462 `True`, the output VCF file will be indexed using tabix. 
If `index`, defaults to False 2463 :type index: bool (optional) 2464 :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the 2465 number of threads to use for exporting the VCF file. It determines how many parallel threads 2466 will be used during the export process. More threads can potentially speed up the export process 2467 by utilizing multiple cores of the processor. If 2468 :type threads: int | None 2469 :return: The `export_variant_vcf` function returns the result of calling the `export_output` 2470 method with various parameters including the output file, query, threads, sort flag, and index 2471 flag. The `export_output` method is responsible for exporting the VCF data based on the 2472 specified parameters and configurations provided in the `export_variant_vcf` function. 2473 """ 2474 2475 # Config 2476 config = self.get_config() 2477 2478 # Extract VCF 2479 log.debug("Export VCF...") 2480 2481 # Table variants 2482 table_variants = self.get_table_variants() 2483 2484 # Threads 2485 if not threads: 2486 threads = self.get_threads() 2487 2488 # Info fields 2489 if remove_info: 2490 if not isinstance(remove_info, str): 2491 remove_info = "." 
2492 info_field = f"""'{remove_info}' as INFO""" 2493 else: 2494 info_field = "INFO" 2495 2496 # Samples fields 2497 if add_samples: 2498 if not list_samples: 2499 list_samples = self.get_header_sample_list() 2500 if list_samples: 2501 samples_fields = " , FORMAT , " + " , ".join(list_samples) 2502 else: 2503 samples_fields = "" 2504 log.debug(f"samples_fields: {samples_fields}") 2505 else: 2506 samples_fields = "" 2507 2508 # Where clause 2509 if where_clause is None: 2510 where_clause = "" 2511 2512 # Variants 2513 select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """ 2514 sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """ 2515 log.debug(f"sql_query_select={sql_query_select}") 2516 2517 return self.export_output( 2518 output_file=vcf_file, 2519 output_header=None, 2520 export_header=True, 2521 query=sql_query_select, 2522 parquet_partitions=None, 2523 chunk_size=config.get("chunk_size", None), 2524 threads=threads, 2525 sort=True, 2526 index=index, 2527 order_by=None, 2528 )
The export_variant_vcf function exports a VCF file with specified samples, allowing options to
remove INFO field, add samples, and control compression and indexing.
Parameters
- `vcf_file`: the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters.
- `remove_info`: a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included. Defaults to False.
- `add_samples`: a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. Defaults to True.
- `list_samples`: a list of samples to include in the output VCF file. By default, all samples will be included. If a list of samples is provided, only those samples will be included in the output file.
- `index`: a boolean flag that determines whether or not to create an index for the output VCF file. If `index` is set to True, the output VCF file will be indexed using tabix. Defaults to False.
- `threads`: the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor.
Returns
The `export_variant_vcf` function returns the result of calling the `export_output` method with various parameters including the output file, query, threads, sort flag, and index flag. The `export_output` method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the `export_variant_vcf` function.
2530 def run_commands(self, commands: list = [], threads: int = 1) -> None: 2531 """ 2532 It takes a list of commands and runs them in parallel using the number of threads specified 2533 2534 :param commands: A list of commands to run 2535 :param threads: The number of threads to use, defaults to 1 (optional) 2536 """ 2537 2538 run_parallel_commands(commands, threads)
It takes a list of commands and runs them in parallel using the number of threads specified
Parameters
- commands: A list of commands to run
- threads: The number of threads to use, defaults to 1 (optional)
2540 def get_threads(self, default: int = 1) -> int: 2541 """ 2542 This function returns the number of threads to use for a job, with a default value of 1 if not 2543 specified. 2544 2545 :param default: The `default` parameter in the `get_threads` method is used to specify the 2546 default number of threads to use if no specific value is provided. If no value is provided for 2547 the `threads` parameter in the configuration or input parameters, the `default` value will be 2548 used, defaults to 1 2549 :type default: int (optional) 2550 :return: the number of threads to use for the current job. 2551 """ 2552 2553 # Config 2554 config = self.get_config() 2555 2556 # Param 2557 param = self.get_param() 2558 2559 # Input threads 2560 input_thread = param.get("threads", config.get("threads", None)) 2561 2562 # Check threads 2563 if not input_thread: 2564 threads = default 2565 elif int(input_thread) <= 0: 2566 threads = os.cpu_count() 2567 else: 2568 threads = int(input_thread) 2569 return threads
This function returns the number of threads to use for a job, with a default value of 1 if not specified.
Parameters
- `default`: used to specify the default number of threads to use if no specific value is provided. If no value is provided for the `threads` parameter in the configuration or input parameters, the `default` value will be used. Defaults to 1.
Returns
the number of threads to use for the current job.
2571 def get_memory(self, default: str = None) -> str: 2572 """ 2573 This function retrieves the memory value from parameters or configuration with a default value 2574 if not found. 2575 2576 :param default: The `get_memory` function takes in a default value as a string parameter. This 2577 default value is used as a fallback in case the `memory` parameter is not provided in the 2578 `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary, 2579 the function 2580 :type default: str 2581 :return: The `get_memory` function returns a string value representing the memory parameter. If 2582 the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will 2583 return the default value provided as an argument to the function. 2584 """ 2585 2586 # Config 2587 config = self.get_config() 2588 2589 # Param 2590 param = self.get_param() 2591 2592 # Input threads 2593 input_memory = param.get("memory", config.get("memory", None)) 2594 2595 # Check threads 2596 if input_memory: 2597 memory = input_memory 2598 else: 2599 memory = default 2600 2601 return memory
This function retrieves the memory value from parameters or configuration with a default value if not found.
Parameters
- `default`: a string default value, used as a fallback in case the `memory` parameter is not provided in the `param` dictionary or the `config` dictionary.
Returns
The `get_memory` function returns a string value representing the memory parameter. If a memory value is provided in the parameters or configuration, it will return that value. Otherwise, it will return the default value provided as an argument to the function.
2603 def update_from_vcf(self, vcf_file: str) -> None: 2604 """ 2605 > If the database is duckdb, then use the parquet method, otherwise use the sqlite method 2606 2607 :param vcf_file: the path to the VCF file 2608 """ 2609 2610 connexion_format = self.get_connexion_format() 2611 2612 if connexion_format in ["duckdb"]: 2613 self.update_from_vcf_duckdb(vcf_file) 2614 elif connexion_format in ["sqlite"]: 2615 self.update_from_vcf_sqlite(vcf_file)
If the database is duckdb, then use the parquet method, otherwise use the sqlite method
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Take a VCF file and update the INFO column of the variants table in
        the database with the INFO column of the VCF file.

        :param vcf_file: the path to the VCF file
        """

        # Variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame: skip the meta-header lines so
        # that the "#CHROM" line is used as the column header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # NOTE: `vcf_df` looks unused but is referenced by name inside the SQL
        # below — duckdb resolves "FROM vcf_df" to this local DataFrame
        # (replacement scan), so the variable must keep this exact name.
        # The query appends the VCF INFO to the existing INFO for variants
        # matching on #CHROM/POS/REF/ALT, inserting ';' only when both sides
        # carry data; '' and '.' are treated as empty values.
        sql_query_update = f"""
            UPDATE {table_variants} as table_variants
            SET INFO = concat(
                CASE
                    WHEN INFO NOT IN ('', '.')
                    THEN INFO
                    ELSE ''
                END,
                (
                SELECT
                    concat(
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END
                        ,
                        CASE
                            WHEN table_parquet.INFO NOT IN ('','.')
                            THEN table_parquet.INFO
                            ELSE ''
                        END
                    )
                FROM vcf_df as table_parquet
                WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                AND table_parquet.\"POS\" = table_variants.\"POS\"
                AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                AND table_parquet.\"REF\" = table_variants.\"REF\"
                AND table_parquet.INFO NOT IN ('','.')
                )
            )
            ;
        """
        self.conn.execute(sql_query_update)
It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file
Parameters
- vcf_file: the path to the VCF file
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Create a temporary table in the SQLite database, load the VCF file
        into the temporary table, then update the INFO column of the variants
        table with the INFO column of the temporary table.

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table for the VCF, cloning the schema of the
        # variants table (WHERE 0 copies the structure without any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF into the temporary table.
        # comment="#" drops every line starting with '#', including the #CHROM
        # header line — hence header=None and the explicit column names below.
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        # NOTE(review): assumes the VCF body has exactly these 8 columns (no
        # FORMAT/sample columns) — confirm against callers
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data, appending the new INFO with a
        # ';' separator when both sides carry data ('' and '.' count as empty)
        # warning: CONCAT as || operator
        # NOTE(review): when no matching row exists in tmp_vcf the subquery
        # yields NULL, and `x || NULL` is NULL in SQLite, which would null
        # INFO for unmatched variants — confirm this is intended
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                        WHEN INFO NOT IN ('', '.')
                        THEN INFO
                        ELSE ''
                    END ||
                    (
                    SELECT
                        CASE
                            WHEN table_variants.INFO NOT IN ('','.')
                                AND table_vcf.INFO NOT IN ('','.')
                            THEN ';'
                            ELSE ''
                        END ||
                        CASE
                            WHEN table_vcf.INFO NOT IN ('','.')
                            THEN table_vcf.INFO
                            ELSE ''
                        END
                    FROM {table_vcf} as table_vcf
                    WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                    AND table_vcf.\"POS\" = table_variants.\"POS\"
                    AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                    AND table_vcf.\"REF\" = table_variants.\"REF\"
                    )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)
It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table
Parameters
- vcf_file: The path to the VCF file you want to update the database with
2731 def drop_variants_table(self) -> None: 2732 """ 2733 > This function drops the variants table 2734 """ 2735 2736 table_variants = self.get_table_variants() 2737 sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}" 2738 self.conn.execute(sql_table_variants)
This function drops the variants table
2740 def set_variant_id( 2741 self, variant_id_column: str = "variant_id", force: bool = None 2742 ) -> str: 2743 """ 2744 It adds a column to the variants table called `variant_id` and populates it with a hash of the 2745 `#CHROM`, `POS`, `REF`, and `ALT` columns 2746 2747 :param variant_id_column: The name of the column to be created in the variants table, defaults 2748 to variant_id 2749 :type variant_id_column: str (optional) 2750 :param force: If True, the variant_id column will be created even if it already exists 2751 :type force: bool 2752 :return: The name of the column that contains the variant_id 2753 """ 2754 2755 # Assembly 2756 assembly = self.get_param().get( 2757 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 2758 ) 2759 2760 # INFO/Tag prefix 2761 prefix = self.get_explode_infos_prefix() 2762 2763 # Explode INFO/SVTYPE 2764 added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"]) 2765 2766 # variants table 2767 table_variants = self.get_table_variants() 2768 2769 # variant_id column 2770 if not variant_id_column: 2771 variant_id_column = "variant_id" 2772 2773 # Creta variant_id column 2774 if "variant_id" not in self.get_extra_infos() or force: 2775 2776 # Create column 2777 self.add_column( 2778 table_name=table_variants, 2779 column_name=variant_id_column, 2780 column_type="UBIGINT", 2781 default_value="0", 2782 ) 2783 2784 # Update column 2785 self.conn.execute( 2786 f""" 2787 UPDATE {table_variants} 2788 SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"') 2789 """ 2790 ) 2791 2792 # Remove added columns 2793 for added_column in added_columns: 2794 self.drop_column(column=added_column) 2795 2796 # return variant_id column name 2797 return variant_id_column
It adds a column to the variants table called variant_id and populates it with a hash of the
#CHROM, POS, REF, and ALT columns
Parameters
- variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
- force: If True, the variant_id column will be created even if it already exists
Returns
The name of the column that contains the variant_id
2799 def get_variant_id_column( 2800 self, variant_id_column: str = "variant_id", force: bool = None 2801 ) -> str: 2802 """ 2803 This function returns the variant_id column name 2804 2805 :param variant_id_column: The name of the column in the dataframe that contains the variant IDs, 2806 defaults to variant_id 2807 :type variant_id_column: str (optional) 2808 :param force: If True, will force the variant_id to be set to the value of variant_id_column. If 2809 False, will only set the variant_id if it is not already set. If None, will set the variant_id 2810 if it is not already set, or if it is set 2811 :type force: bool 2812 :return: The variant_id column name. 2813 """ 2814 2815 return self.set_variant_id(variant_id_column=variant_id_column, force=force)
This function returns the variant_id column name
Parameters
- variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
- force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns
The variant_id column name.
2821 def scan_databases( 2822 self, 2823 database_formats: list = ["parquet"], 2824 database_releases: list = ["current"], 2825 ) -> dict: 2826 """ 2827 The function `scan_databases` scans for available databases based on specified formats and 2828 releases. 2829 2830 :param database_formats: The `database_formats` parameter is a list that specifies the formats 2831 of the databases to be scanned. In this case, the accepted format is "parquet" 2832 :type database_formats: list ["parquet"] 2833 :param database_releases: The `database_releases` parameter is a list that specifies the 2834 releases of the databases to be scanned. In the provided function, the default value for 2835 `database_releases` is set to `["current"]`, meaning that by default, the function will scan 2836 databases that are in the "current" 2837 :type database_releases: list 2838 :return: The function `scan_databases` returns a dictionary containing information about 2839 databases that match the specified formats and releases. 2840 """ 2841 2842 # Config 2843 config = self.get_config() 2844 2845 # Param 2846 param = self.get_param() 2847 2848 # Param - Assembly 2849 assembly = param.get("assembly", config.get("assembly", None)) 2850 if not assembly: 2851 assembly = DEFAULT_ASSEMBLY 2852 log.warning(f"Default assembly '{assembly}'") 2853 2854 # Scan for availabled databases 2855 log.info( 2856 f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..." 2857 ) 2858 databases_infos_dict = databases_infos( 2859 database_folder_releases=database_releases, 2860 database_formats=database_formats, 2861 assembly=assembly, 2862 config=config, 2863 ) 2864 log.info( 2865 f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found" 2866 ) 2867 2868 return databases_infos_dict
The function scan_databases scans for available databases based on specified formats and
releases.
Parameters
- `database_formats`: a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet".
- `database_releases`: a list that specifies the releases of the databases to be scanned. The default value is `["current"]`, meaning that by default, the function will scan databases in the "current" release.
Returns
The function `scan_databases` returns a dictionary containing information about databases that match the specified formats and releases.
    def annotation(self) -> None:
        """
        Annotate the VCF file with the annotations specified in the config/param.

        The method normalizes all annotation inputs into a single comma-separated
        `param["annotations"]` string (merging the per-tool shortcut parameters
        `annotation_parquet`, `annotation_snpsift`, `annotation_snpeff`,
        `annotation_bcftools`, `annotation_annovar`, `annotation_exomiser`,
        `annotation_splice`), resolves each entry to a database file and an
        annotation tool, fills `param["annotation"][<tool>]`, then dispatches to
        the per-tool annotation methods (annotation_parquet, annotation_bcftools,
        annotation_snpsift, annotation_bigwig, annotation_annovar,
        annotation_snpeff, annotation_exomiser, annotation_splice).
        Finally, INFO fields are exploded into table columns if requested.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param overrides config; fall back to default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders (union of annotations/parquet/bcftools folders)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (comma-separated string form only)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param — fold the per-tool shortcut params into the list,
        # prefixing each with its tool name ("snpsift:", "bcftools:", ...)
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # "," is replaced by "+" so the entry stays one item after the later split(",")
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: normalize string form into
            # {file: {"INFO": None}} (meaning "all INFO fields")
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file.strip()] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f" {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL (e.g. "ALL:format=parquet:release=current")
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases)
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for available databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options: each ":"-separated token is an annovar annotation key
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS
                    else:

                        # Tools detection (explicit "tool:" prefix wins)
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("bigwig:"):
                            annotation_tool_initial = "bigwig"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ("+" and ":" both separate files here)
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file: as-is, expanded path, then configured folders
                                annotation_file_found = None

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file
                                elif os.path.exists(full_path(annotation_file)):
                                    annotation_file_found = full_path(annotation_file)
                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                    log.debug(
                                        f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                    )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # NOTE(review): hard-coded False, so the bcftools
                                    # branch below is currently unreachable — confirm intent
                                    bcftools_preference = False

                                    # Check Annotation Tool by database format
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        # NOTE(review): "tsv" is duplicated in this list (harmless)
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        elif quick_annotation_format in [
                                            "bw"
                                        ]:
                                            annotation_tool = "bigwig"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register file under its tool
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    log.warning(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

        self.set_param(param)

        # Dispatch to each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("bigwig", None):
                log.info("Annotations 'bigwig'...")
                self.annotation_bigwig()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It annotates the VCF file with the annotations specified in the config file, then dispatches to the per-tool annotation methods.
    def annotation_bigwig(self, threads: int = None) -> None:
        """
        The function `annotation_bigwig` annotates variants in a VCF file using bigwig databases.

        For each configured bigwig database, the VCF (without INFO) is exported to a
        temporary file, read with cyvcf2, and each variant gets per-position values
        looked up with pyBigWig (`values(CHROM, POS-1, POS)`); the annotated VCF is
        then merged back into the variants table via `update_from_vcf`.

        :param threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the
        number of threads to be used for parallel processing during the annotation process. If the
        `threads` parameter is not provided, the method will attempt to determine the optimal number of
        threads to use based on the system configuration
        :type threads: int
        :return: True
        """

        # DEBUG
        log.debug("Start annotation with bigwig databases")

        # # Threads
        # if not threads:
        #     threads = self.get_threads()
        # log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - bigwig databases folders (annotations + bigwig folders)
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bigwig", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bigwig", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:

                # Export VCF file
                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")

                # annotation_bigwig_config: one entry per valid database
                annotation_bigwig_config_list = []

                for annotation in annotations:
                    annotation_fields = annotations[annotation]

                    # Annotation Name
                    annotation_name = os.path.basename(annotation)

                    if not annotation_fields:
                        annotation_fields = {"INFO": None}

                    log.debug(f"Annotation '{annotation_name}'")
                    log.debug(
                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                    )

                    # Create Database
                    database = Database(
                        database=annotation,
                        databases_folders=databases_folders,
                        assembly=assembly,
                    )

                    # Find files
                    db_file = database.get_database()
                    db_file = full_path(db_file)
                    db_hdr_file = database.get_header_file()
                    db_hdr_file = full_path(db_hdr_file)
                    db_file_type = database.get_format()

                    # If db_file is http ?
                    if database.get_database().startswith("http"):

                        # Database is HTTP URL
                        db_file_is_http = True

                        # DB file keep as URL (pyBigWig can open remote bigwigs)
                        db_file = database.get_database()
                        log.warning(f"Annotations 'bigwig' database '{db_file}' - is an HTTP URL (experimental)")

                        # Retrieve automatic annotation field name from file basename
                        annotation_field = clean_annotation_field(os.path.basename(db_file).replace(".bw", ""))
                        log.debug(f"Create header file with annotation field '{annotation_field}' is an HTTP URL")

                        # Create automatic header file
                        db_hdr_file = os.path.join(tmp_dir, "header.hdr")
                        with open(db_hdr_file, 'w') as f:
                            f.write("##fileformat=VCFv4.2\n")
                            f.write(f"""##INFO=<ID={annotation_field},Number=.,Type=Float,Description="{annotation_field} annotation from {db_file}">\n""")
                            f.write(f"#CHROM START END {annotation_field}\n")

                    else:

                        # Database is NOT HTTP URL
                        db_file_is_http = False


                    # Check index - try to create if not exists
                    # (local file must exist unless HTTP; header must exist; format must be bigwig)
                    if db_file is None or db_hdr_file is None or (not os.path.exists(db_file) and not db_file_is_http) or not os.path.exists(db_hdr_file) or not db_file_type in ["bw"]:
                        #if False:
                        log.error("Annotation failed: database not valid")
                        log.error(f"Annotation annotation file: {db_file}")
                        log.error(f"Annotation annotation file type: {db_file_type}")
                        log.error(f"Annotation annotation header: {db_hdr_file}")
                        raise ValueError(
                            f"Annotation failed: database not valid - annotation file {db_file} / annotation file type {db_file_type} / annotation header {db_hdr_file}"
                        )
                    else:

                        # Log
                        log.debug(
                            f"Annotation '{annotation}' - file: "
                            + str(db_file)
                            + " and "
                            + str(db_hdr_file)
                        )

                        # Load header as VCF object
                        db_hdr_vcf = Variants(input=db_hdr_file)
                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                        log.debug(
                            "Annotation database header: "
                            + str(db_hdr_vcf_header_infos)
                        )

                        # For all fields in database
                        annotation_fields_full = False
                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
                            annotation_fields = {
                                key: key for key in db_hdr_vcf_header_infos
                            }
                            log.debug(
                                "Annotation database header - All annotations added: "
                                + str(annotation_fields)
                            )
                            annotation_fields_full = True

                        # Init
                        cyvcf2_header_rename_dict = {}
                        cyvcf2_header_list = []
                        cyvcf2_header_indexes = {}

                        # process annotation fields
                        for annotation_field in annotation_fields:

                            # New annotation name (mapped output field name)
                            annotation_field_new = annotation_fields[annotation_field]

                            # Check annotation field and index in header
                            # (the -3 skips the CHROM/START/END columns)
                            if annotation_field in db_hdr_vcf.get_header_columns_as_list():
                                annotation_field_index = db_hdr_vcf.get_header_columns_as_list().index(annotation_field)-3
                                cyvcf2_header_indexes[annotation_field_new] = annotation_field_index
                            else:
                                msg_err = f"Database '{db_file}' does NOT contain annotation field '{annotation_field}'"
                                log.error(msg_err)
                                raise ValueError(msg_err)

                            # Append annotation field in cyvcf2 header list
                            cyvcf2_header_rename_dict[annotation_field_new] = db_hdr_vcf_header_infos[annotation_field].id
                            cyvcf2_header_list.append(
                                {
                                    "ID": annotation_field_new,
                                    "Number": db_hdr_vcf_header_infos[annotation_field].num,
                                    "Type": db_hdr_vcf_header_infos[annotation_field].type,
                                    "Description": db_hdr_vcf_header_infos[annotation_field].desc,
                                }
                            )

                        # Load bigwig database
                        bw_db = pyBigWig.open(db_file)
                        if bw_db.isBigWig():
                            log.debug(f"Database '{db_file}' is in 'BigWig' format")
                        else:
                            msg_err = f"Database '{db_file}' is NOT in 'BigWig' format"
                            log.error(msg_err)
                            raise ValueError(msg_err)

                        annotation_bigwig_config_list.append(
                            {
                                "db_file": db_file,
                                "bw_db": bw_db,
                                "cyvcf2_header_rename_dict": cyvcf2_header_rename_dict,
                                "cyvcf2_header_list": cyvcf2_header_list,
                                "cyvcf2_header_indexes": cyvcf2_header_indexes
                            }
                        )

                # Annotate
                if annotation_bigwig_config_list:

                    # Annotation config
                    log.debug(f"annotation_bigwig_config={annotation_bigwig_config_list}")

                    # Export VCF file (INFO stripped, no samples, indexed)
                    self.export_variant_vcf(
                        vcf_file=tmp_vcf_name,
                        remove_info=True,
                        add_samples=False,
                        index=True,
                    )

                    # Load input tmp file
                    input_vcf = cyvcf2.VCF(tmp_vcf_name)

                    # Add header in input file
                    for annotation_bigwig_config in annotation_bigwig_config_list:
                        for cyvcf2_header_field in annotation_bigwig_config.get("cyvcf2_header_list",[]):
                            log.info(f"Annotations 'bigwig' database '{os.path.basename(annotation_bigwig_config.get('db_file'))}' - annotation field '{annotation_bigwig_config.get('cyvcf2_header_rename_dict',{}).get(cyvcf2_header_field.get('ID','Unknown'))}' -> '{cyvcf2_header_field.get('ID')}'")
                            input_vcf.add_info_to_header(
                                cyvcf2_header_field
                            )

                    # Create output VCF file
                    output_vcf_file = os.path.join(tmp_dir,"output.vcf.gz")
                    output_vcf = cyvcf2.Writer(output_vcf_file, input_vcf)

                    # Fetch variants
                    log.info(f"Annotations 'bigwig' start...")
                    for variant in input_vcf:

                        for annotation_bigwig_config in annotation_bigwig_config_list:

                            # DB and indexes
                            bw_db = annotation_bigwig_config.get("bw_db", None)
                            cyvcf2_header_indexes = annotation_bigwig_config.get("cyvcf2_header_indexes", None)

                            # Retrieve value from chrom pos (VCF POS is 1-based; bigwig is 0-based)
                            res = bw_db.values(variant.CHROM, variant.POS - 1, variant.POS)

                            # For each annotation fields (and indexes)
                            for cyvcf2_header_index in cyvcf2_header_indexes:

                                # If value is NOT NaN (pyBigWig returns NaN for uncovered positions)
                                if not np.isnan(res[cyvcf2_header_indexes[cyvcf2_header_index]]):
                                    variant.INFO[cyvcf2_header_index] = res[cyvcf2_header_indexes[cyvcf2_header_index]]

                        # Add record in output file
                        output_vcf.write_record(variant)

                    # Log
                    log.debug(f"Annotation done.")

                    # Close and write file
                    log.info(f"Annotations 'bigwig' write...")
                    output_vcf.close()
                    log.debug(f"Write done.")

                    # Update variants
                    log.info(f"Annotations 'bigwig' update...")
                    self.update_from_vcf(output_vcf_file)
                    log.debug(f"Update done.")

        return True
The function annotation_bigwig annotates variants in a VCF file using bigwig databases.
Parameters
- threads: The `threads` parameter in the `annotation_bigwig` method is used to specify the number of threads to be used for parallel processing during the annotation process. If the `threads` parameter is not provided, the method will attempt to determine the optimal number of threads to use based on the system configuration.
Returns
True
3550 def annotation_snpsift(self, threads: int = None) -> None: 3551 """ 3552 This function annotate with bcftools 3553 3554 :param threads: Number of threads to use 3555 :return: the value of the variable "return_value". 3556 """ 3557 3558 # DEBUG 3559 log.debug("Start annotation with bcftools databases") 3560 3561 # Threads 3562 if not threads: 3563 threads = self.get_threads() 3564 log.debug("Threads: " + str(threads)) 3565 3566 # Config 3567 config = self.get_config() 3568 log.debug("Config: " + str(config)) 3569 3570 # Config - snpSift 3571 snpsift_bin_command = get_bin_command( 3572 bin="SnpSift.jar", 3573 tool="snpsift", 3574 bin_type="jar", 3575 config=config, 3576 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 3577 ) 3578 if not snpsift_bin_command: 3579 msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'" 3580 log.error(msg_err) 3581 raise ValueError(msg_err) 3582 3583 # Config - bcftools 3584 bcftools_bin_command = get_bin_command( 3585 bin="bcftools", 3586 tool="bcftools", 3587 bin_type="bin", 3588 config=config, 3589 default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools", 3590 ) 3591 if not bcftools_bin_command: 3592 msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'" 3593 log.error(msg_err) 3594 raise ValueError(msg_err) 3595 3596 # Config - BCFTools databases folders 3597 databases_folders = set( 3598 self.get_config() 3599 .get("folders", {}) 3600 .get("databases", {}) 3601 .get("annotations", ["."]) 3602 + self.get_config() 3603 .get("folders", {}) 3604 .get("databases", {}) 3605 .get("bcftools", ["."]) 3606 ) 3607 log.debug("Databases annotations: " + str(databases_folders)) 3608 3609 # Param 3610 annotations = ( 3611 self.get_param() 3612 .get("annotation", {}) 3613 .get("snpsift", {}) 3614 .get("annotations", None) 3615 ) 3616 log.debug("Annotations: " + str(annotations)) 3617 3618 # Assembly 3619 assembly = self.get_param().get( 3620 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 3621 ) 3622 
3623 # Data 3624 table_variants = self.get_table_variants() 3625 3626 # Check if not empty 3627 log.debug("Check if not empty") 3628 sql_query_chromosomes = ( 3629 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 3630 ) 3631 sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes) 3632 if not sql_query_chromosomes_df["count"][0]: 3633 log.info(f"VCF empty") 3634 return 3635 3636 # VCF header 3637 vcf_reader = self.get_header() 3638 log.debug("Initial header: " + str(vcf_reader.infos)) 3639 3640 # Existing annotations 3641 for vcf_annotation in self.get_header().infos: 3642 3643 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 3644 log.debug( 3645 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 3646 ) 3647 3648 if annotations: 3649 3650 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 3651 3652 # Export VCF file 3653 tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz") 3654 3655 # Init 3656 commands = {} 3657 3658 for annotation in annotations: 3659 annotation_fields = annotations[annotation] 3660 3661 # Annotation Name 3662 annotation_name = os.path.basename(annotation) 3663 3664 if not annotation_fields: 3665 annotation_fields = {"INFO": None} 3666 3667 log.debug(f"Annotation '{annotation_name}'") 3668 log.debug( 3669 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 3670 ) 3671 3672 # Create Database 3673 database = Database( 3674 database=annotation, 3675 databases_folders=databases_folders, 3676 assembly=assembly, 3677 ) 3678 3679 # Find files 3680 db_file = database.get_database() 3681 db_file = full_path(db_file) 3682 db_hdr_file = database.get_header_file() 3683 db_hdr_file = full_path(db_hdr_file) 3684 db_file_type = database.get_format() 3685 db_tbi_file = f"{db_file}.tbi" 3686 db_file_compressed = database.is_compressed() 3687 3688 # Check if compressed 3689 if not db_file_compressed: 3690 log.error( 3691 f"Annotation '{annotation}' - {db_file} NOT 
compressed file" 3692 ) 3693 raise ValueError( 3694 f"Annotation '{annotation}' - {db_file} NOT compressed file" 3695 ) 3696 3697 # Check if indexed 3698 if not os.path.exists(db_tbi_file): 3699 log.error( 3700 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3701 ) 3702 raise ValueError( 3703 f"Annotation '{annotation}' - {db_file} NOT indexed file" 3704 ) 3705 3706 # Check index - try to create if not exists 3707 if not os.path.exists(db_file) or not os.path.exists(db_hdr_file): 3708 log.error("Annotation failed: database not valid") 3709 log.error(f"Annotation annotation file: {db_file}") 3710 log.error(f"Annotation annotation header: {db_hdr_file}") 3711 log.error(f"Annotation annotation index: {db_tbi_file}") 3712 raise ValueError( 3713 f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}" 3714 ) 3715 else: 3716 3717 log.debug( 3718 f"Annotation '{annotation}' - file: " 3719 + str(db_file) 3720 + " and " 3721 + str(db_hdr_file) 3722 ) 3723 3724 # Load header as VCF object 3725 db_hdr_vcf = Variants(input=db_hdr_file) 3726 db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos 3727 log.debug( 3728 "Annotation database header: " 3729 + str(db_hdr_vcf_header_infos) 3730 ) 3731 3732 # For all fields in database 3733 annotation_fields_full = False 3734 if "ALL" in annotation_fields or "INFO" in annotation_fields: 3735 annotation_fields = { 3736 key: key for key in db_hdr_vcf_header_infos 3737 } 3738 log.debug( 3739 "Annotation database header - All annotations added: " 3740 + str(annotation_fields) 3741 ) 3742 annotation_fields_full = True 3743 3744 # # Create file for field rename 3745 # log.debug("Create file for field rename") 3746 # tmp_rename = NamedTemporaryFile( 3747 # prefix=self.get_prefix(), 3748 # dir=self.get_tmp_dir(), 3749 # suffix=".rename", 3750 # delete=False, 3751 # ) 3752 # tmp_rename_name = tmp_rename.name 
3753 # tmp_files.append(tmp_rename_name) 3754 3755 # Number of fields 3756 nb_annotation_field = 0 3757 annotation_list = [] 3758 annotation_infos_rename_list = [] 3759 3760 for annotation_field in annotation_fields: 3761 3762 # field new name, if parametered SKIPPED !!!!!! not managed actually TODO 3763 annotation_fields_new_name = annotation_fields.get( 3764 annotation_field, annotation_field 3765 ) 3766 if not annotation_fields_new_name: 3767 annotation_fields_new_name = annotation_field 3768 3769 # Check if field is in DB and if field is not elready in input data 3770 if ( 3771 annotation_field in db_hdr_vcf.get_header().infos 3772 and annotation_fields_new_name 3773 not in self.get_header().infos 3774 ): 3775 3776 log.info( 3777 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'" 3778 ) 3779 3780 # BCFTools annotate param to rename fields 3781 if annotation_field != annotation_fields_new_name: 3782 annotation_infos_rename_list.append( 3783 f"{annotation_fields_new_name}:=INFO/{annotation_field}" 3784 ) 3785 3786 # Add INFO field to header 3787 db_hdr_vcf_header_infos_number = ( 3788 db_hdr_vcf_header_infos[annotation_field].num or "." 
3789 ) 3790 db_hdr_vcf_header_infos_type = ( 3791 db_hdr_vcf_header_infos[annotation_field].type 3792 or "String" 3793 ) 3794 db_hdr_vcf_header_infos_description = ( 3795 db_hdr_vcf_header_infos[annotation_field].desc 3796 or f"{annotation_field} description" 3797 ) 3798 db_hdr_vcf_header_infos_source = ( 3799 db_hdr_vcf_header_infos[annotation_field].source 3800 or "unknown" 3801 ) 3802 db_hdr_vcf_header_infos_version = ( 3803 db_hdr_vcf_header_infos[annotation_field].version 3804 or "unknown" 3805 ) 3806 3807 vcf_reader.infos[annotation_fields_new_name] = ( 3808 vcf.parser._Info( 3809 annotation_fields_new_name, 3810 db_hdr_vcf_header_infos_number, 3811 db_hdr_vcf_header_infos_type, 3812 db_hdr_vcf_header_infos_description, 3813 db_hdr_vcf_header_infos_source, 3814 db_hdr_vcf_header_infos_version, 3815 self.code_type_map[ 3816 db_hdr_vcf_header_infos_type 3817 ], 3818 ) 3819 ) 3820 3821 annotation_list.append(annotation_field) 3822 3823 nb_annotation_field += 1 3824 3825 else: 3826 3827 if ( 3828 annotation_field 3829 not in db_hdr_vcf.get_header().infos 3830 ): 3831 log.warning( 3832 f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file" 3833 ) 3834 if ( 3835 annotation_fields_new_name 3836 in self.get_header().infos 3837 ): 3838 log.warning( 3839 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)" 3840 ) 3841 3842 log.info( 3843 f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file" 3844 ) 3845 3846 annotation_infos = ",".join(annotation_list) 3847 3848 if annotation_infos != "": 3849 3850 # Annotated VCF (and error file) 3851 tmp_annotation_vcf_name = os.path.join( 3852 tmp_dir, os.path.basename(annotation) + ".vcf.gz" 3853 ) 3854 tmp_annotation_vcf_name_err = ( 3855 tmp_annotation_vcf_name + ".err" 3856 ) 3857 3858 # Add fields to annotate 3859 if not annotation_fields_full: 3860 annotation_infos_option = f"-info {annotation_infos}" 3861 else: 
3862 annotation_infos_option = "" 3863 3864 # Info fields rename 3865 if annotation_infos_rename_list: 3866 annotation_infos_rename = " -c " + ",".join( 3867 annotation_infos_rename_list 3868 ) 3869 else: 3870 annotation_infos_rename = "" 3871 3872 # Annotate command 3873 command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} " 3874 3875 # Add command 3876 commands[command_annotate] = tmp_annotation_vcf_name 3877 3878 if commands: 3879 3880 # Export VCF file 3881 self.export_variant_vcf( 3882 vcf_file=tmp_vcf_name, 3883 remove_info=True, 3884 add_samples=False, 3885 index=True, 3886 ) 3887 shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf") 3888 3889 # Num command 3890 nb_command = 0 3891 3892 # Annotate 3893 for command_annotate in commands: 3894 nb_command += 1 3895 log.info( 3896 f"Annotation - Annotate [{nb_command}/{len(commands)}]..." 3897 ) 3898 log.debug(f"command_annotate={command_annotate}") 3899 run_parallel_commands([command_annotate], threads) 3900 3901 # Debug 3902 shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf") 3903 3904 # Update variants 3905 log.info( 3906 f"Annotation - Updating [{nb_command}/{len(commands)}]..." 3907 ) 3908 self.update_from_vcf(commands[command_annotate])
This function annotates with snpSift (piped through bcftools to rename INFO fields).
Parameters
- threads: Number of threads to use
Returns
None; the variants table is updated in place.
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table using `bcftools annotate`.

        For each configured annotation database (param section
        "annotation" -> "bcftools" -> "annotations"):
          - resolve the database file, its header file and format via `Database`,
          - require the database to be bgzip-compressed and tabix-indexed
            (only a `.tbi` index is checked for; CSI indexes are not detected),
          - add the selected INFO fields to this object's VCF header,
          - build one `bcftools annotate` shell command per chromosome, restricted
            by a BED of merged +/- 1Mb windows around the variants of that chromosome.

        The current variants are exported once to a temporary bgzipped VCF, all
        annotate commands are run in parallel, the per-chromosome results are
        merged back with `bcftools merge`, stderr of every command is scanned for
        `[W::`/`[E::` messages (any `[E::` aborts with ValueError), and finally the
        variants table is updated from the merged VCF via `update_from_vcf`.

        :param threads: Number of threads to use; defaults to `self.get_threads()`
        :return: None. Returns early (without annotating) if the variants table
            is empty or no annotations are configured.
        :raises ValueError: if the bcftools binary is not found, if a database is
            not compressed/indexed/valid, or if any annotate/merge command wrote
            an `[E::` error to its stderr file.
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # Keep temporary files around in debug mode for inspection.
        # NOTE(review): `delete_tmp` is computed but not read again in this
        # method — cleanup is done by the `rm -f` appended to the merge command
        # and by NamedTemporaryFile(delete=True); confirm intent upstream.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific
        # folders; `set` removes duplicates (iteration order is not significant).
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param
        # Mapping of database -> {field: new_name} (or None for "all fields").
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # The temp file is only named here; the actual export happens later,
        # once we know at least one annotate command will be run.
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            tmp_ann_vcf_list = []  # per-chromosome annotated VCFs, for the merge
            commands = []  # shell commands to run in parallel
            tmp_files = []  # temp files removed by the merge command's `rm -f`
            err_files = []  # stderr capture files, scanned after the merge

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit field selection means "all INFO fields".
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (only a .tbi next to the file is accepted)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header file parsed as a VCF)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database
                    # "ALL"/"INFO" is a wildcard: take every INFO field, unrenamed.
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not elready in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing
                            # attributes ('.', String, generated description...)
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # annotation_list.append(annotation_field)
                            # Renamed fields use bcftools' "NEW:=INFO/OLD" syntax.
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only '##' meta lines from the header file
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chomosomes
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED columns in the annotation file
                        # BED databases have no INFO column: prepend positional
                        # columns so -c maps them correctly.
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome, restricted to a
                        # BED of merged windows around the chromosome's variants.
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detecte regions: +/- 1Mb window per variant,
                            # clamped at 0, then merged into disjoint intervals.
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT \"#CHROM\",
                                CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate then tabix-index the result;
                            # stderr of both steps is appended to the .err file.
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (now that we know annotation will happen)
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # calculate threads for annotated commands
                # (split the thread budget across the parallel commands)
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

            # Merge
            tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

            if tmp_ann_vcf_list_cmd:

                # Tmp file
                # delete=True: removed automatically once this object is
                # released, i.e. after update_from_vcf has consumed it.
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=True,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)

                # Tmp file remove command (chained after the merge succeeds)
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                # Command merge: combine the initial VCF with every
                # per-chromosome annotated VCF into one file.
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                log.info(
                    f"Annotation - Annotation merging "
                    + str(len(commands))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Error messages: scan every .err file for htslib-style
                # warning ([W::) and error ([E::) prefixes.
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f" {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f" {message}")
                # failed: any [E:: message from any command aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

                # Update variants table from the merged, annotated VCF
                log.info(f"Annotation - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)
This function annotates the variants with bcftools.
Parameters
- threads: number of threads to use
Returns
None
4392 def annotation_exomiser(self, threads: int = None) -> None: 4393 """ 4394 This function annotate with Exomiser 4395 4396 This function uses args as parameters, in section "annotation" -> "exomiser", with sections: 4397 - "analysis" (dict/file): 4398 Full analysis dictionnary parameters (see Exomiser docs). 4399 Either a dict, or a file in JSON or YAML format. 4400 These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) 4401 Default : None 4402 - "preset" (string): 4403 Analysis preset (available in config folder). 4404 Used if no full "analysis" is provided. 4405 Default: "exome" 4406 - "phenopacket" (dict/file): 4407 Samples and phenotipic features parameters (see Exomiser docs). 4408 Either a dict, or a file in JSON or YAML format. 4409 Default: None 4410 - "subject" (dict): 4411 Sample parameters (see Exomiser docs). 4412 Example: 4413 "subject": 4414 { 4415 "id": "ISDBM322017", 4416 "sex": "FEMALE" 4417 } 4418 Default: None 4419 - "sample" (string): 4420 Sample name to construct "subject" section: 4421 "subject": 4422 { 4423 "id": "<sample>", 4424 "sex": "UNKNOWN_SEX" 4425 } 4426 Default: None 4427 - "phenotypicFeatures" (dict) 4428 Phenotypic features to construct "subject" section. 4429 Example: 4430 "phenotypicFeatures": 4431 [ 4432 { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, 4433 { "type": { "id": "HP:0000486", "label": "Strabismus" } } 4434 ] 4435 - "hpo" (list) 4436 List of HPO ids as phenotypic features. 4437 Example: 4438 "hpo": ['0001156', '0001363', '0011304', '0010055'] 4439 Default: [] 4440 - "outputOptions" (dict): 4441 Output options (see Exomiser docs). 
4442 Default: 4443 "output_options" = 4444 { 4445 "outputContributingVariantsOnly": False, 4446 "numGenes": 0, 4447 "outputFormats": ["TSV_VARIANT", "VCF"] 4448 } 4449 - "transcript_source" (string): 4450 Transcript source (either "refseq", "ucsc", "ensembl") 4451 Default: "refseq" 4452 - "exomiser_to_info" (boolean): 4453 Add exomiser TSV file columns as INFO fields in VCF. 4454 Default: False 4455 - "release" (string): 4456 Exomise database release. 4457 If not exists, database release will be downloaded (take a while). 4458 Default: None (provided by application.properties configuration file) 4459 - "exomiser_application_properties" (file): 4460 Exomiser configuration file (see Exomiser docs). 4461 Useful to automatically download databases (especially for specific genome databases). 4462 4463 Notes: 4464 - If no sample in parameters, first sample in VCF will be chosen 4465 - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off 4466 4467 :param threads: The number of threads to use 4468 :return: None. 
4469 """ 4470 4471 # DEBUG 4472 log.debug("Start annotation with Exomiser databases") 4473 4474 # Threads 4475 if not threads: 4476 threads = self.get_threads() 4477 log.debug("Threads: " + str(threads)) 4478 4479 # Config 4480 config = self.get_config() 4481 log.debug("Config: " + str(config)) 4482 4483 # Config - Folders - Databases 4484 databases_folders = ( 4485 config.get("folders", {}) 4486 .get("databases", {}) 4487 .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current") 4488 ) 4489 databases_folders = full_path(databases_folders) 4490 if not os.path.exists(databases_folders): 4491 log.error(f"Databases annotations: {databases_folders} NOT found") 4492 log.debug("Databases annotations: " + str(databases_folders)) 4493 4494 # Config - Exomiser 4495 exomiser_bin_command = get_bin_command( 4496 bin="exomiser-cli*.jar", 4497 tool="exomiser", 4498 bin_type="jar", 4499 config=config, 4500 default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser", 4501 ) 4502 log.debug("Exomiser bin command: " + str(exomiser_bin_command)) 4503 if not exomiser_bin_command: 4504 msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'" 4505 log.error(msg_err) 4506 raise ValueError(msg_err) 4507 4508 # Param 4509 param = self.get_param() 4510 log.debug("Param: " + str(param)) 4511 4512 # Param - Exomiser 4513 param_exomiser = param.get("annotation", {}).get("exomiser", {}) 4514 log.debug(f"Param Exomiser: {param_exomiser}") 4515 4516 # Param - Assembly 4517 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 4518 log.debug("Assembly: " + str(assembly)) 4519 4520 # Data 4521 table_variants = self.get_table_variants() 4522 4523 # Check if not empty 4524 log.debug("Check if not empty") 4525 sql_query_chromosomes = ( 4526 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 4527 ) 4528 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 4529 log.info(f"VCF empty") 4530 return False 4531 4532 # VCF header 4533 
vcf_reader = self.get_header() 4534 log.debug("Initial header: " + str(vcf_reader.infos)) 4535 4536 # Samples 4537 samples = self.get_header_sample_list() 4538 if not samples: 4539 log.error("No Samples in VCF") 4540 return False 4541 log.debug(f"Samples: {samples}") 4542 4543 # Memory limit 4544 memory_limit = self.get_memory("8G") 4545 log.debug(f"memory_limit: {memory_limit}") 4546 4547 # Exomiser java options 4548 exomiser_java_options = ( 4549 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 4550 ) 4551 log.debug(f"Exomiser java options: {exomiser_java_options}") 4552 4553 # Download Exomiser (if not exists) 4554 exomiser_release = param_exomiser.get("release", None) 4555 exomiser_application_properties = param_exomiser.get( 4556 "exomiser_application_properties", None 4557 ) 4558 databases_download_exomiser( 4559 assemblies=[assembly], 4560 exomiser_folder=databases_folders, 4561 exomiser_release=exomiser_release, 4562 exomiser_phenotype_release=exomiser_release, 4563 exomiser_application_properties=exomiser_application_properties, 4564 ) 4565 4566 # Force annotation 4567 force_update_annotation = True 4568 4569 if "Exomiser" not in self.get_header().infos or force_update_annotation: 4570 log.debug("Start annotation Exomiser") 4571 4572 with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir: 4573 4574 # tmp_dir = "/tmp/exomiser" 4575 4576 ### ANALYSIS ### 4577 ################ 4578 4579 # Create analysis.json through analysis dict 4580 # either analysis in param or by default 4581 # depending on preset exome/genome) 4582 4583 # Init analysis dict 4584 param_exomiser_analysis_dict = {} 4585 4586 # analysis from param 4587 param_exomiser_analysis = param_exomiser.get("analysis", {}) 4588 param_exomiser_analysis = full_path(param_exomiser_analysis) 4589 4590 # If analysis in param -> load anlaysis json 4591 if param_exomiser_analysis: 4592 4593 # If param analysis is a file and exists 4594 if isinstance(param_exomiser_analysis, str) 
and os.path.exists( 4595 param_exomiser_analysis 4596 ): 4597 # Load analysis file into analysis dict (either yaml or json) 4598 with open(param_exomiser_analysis) as json_file: 4599 param_exomiser_analysis_dict = yaml.safe_load(json_file) 4600 4601 # If param analysis is a dict 4602 elif isinstance(param_exomiser_analysis, dict): 4603 # Load analysis dict into analysis dict (either yaml or json) 4604 param_exomiser_analysis_dict = param_exomiser_analysis 4605 4606 # Error analysis type 4607 else: 4608 log.error(f"Analysis type unknown. Check param file.") 4609 raise ValueError(f"Analysis type unknown. Check param file.") 4610 4611 # Case no input analysis config file/dict 4612 # Use preset (exome/genome) to open default config file 4613 if not param_exomiser_analysis_dict: 4614 4615 # default preset 4616 default_preset = "exome" 4617 4618 # Get param preset or default preset 4619 param_exomiser_preset = param_exomiser.get("preset", default_preset) 4620 4621 # Try to find if preset is a file 4622 if os.path.exists(param_exomiser_preset): 4623 # Preset file is provided in full path 4624 param_exomiser_analysis_default_config_file = ( 4625 param_exomiser_preset 4626 ) 4627 # elif os.path.exists(full_path(param_exomiser_preset)): 4628 # # Preset file is provided in full path 4629 # param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset) 4630 elif os.path.exists( 4631 os.path.join(folder_config, param_exomiser_preset) 4632 ): 4633 # Preset file is provided a basename in config folder (can be a path with subfolders) 4634 param_exomiser_analysis_default_config_file = os.path.join( 4635 folder_config, param_exomiser_preset 4636 ) 4637 else: 4638 # Construct preset file 4639 param_exomiser_analysis_default_config_file = os.path.join( 4640 folder_config, 4641 f"preset-{param_exomiser_preset}-analysis.json", 4642 ) 4643 4644 # If preset file exists 4645 param_exomiser_analysis_default_config_file = full_path( 4646 
param_exomiser_analysis_default_config_file 4647 ) 4648 if os.path.exists(param_exomiser_analysis_default_config_file): 4649 # Load prest file into analysis dict (either yaml or json) 4650 with open( 4651 param_exomiser_analysis_default_config_file 4652 ) as json_file: 4653 # param_exomiser_analysis_dict[""] = json.load(json_file) 4654 param_exomiser_analysis_dict["analysis"] = yaml.safe_load( 4655 json_file 4656 ) 4657 4658 # Error preset file 4659 else: 4660 log.error( 4661 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4662 ) 4663 raise ValueError( 4664 f"No analysis preset config file ({param_exomiser_analysis_default_config_file})" 4665 ) 4666 4667 # If no analysis dict created 4668 if not param_exomiser_analysis_dict: 4669 log.error(f"No analysis config") 4670 raise ValueError(f"No analysis config") 4671 4672 # Log 4673 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4674 4675 ### PHENOPACKET ### 4676 ################### 4677 4678 # If no PhenoPacket in analysis dict -> check in param 4679 if "phenopacket" not in param_exomiser_analysis_dict: 4680 4681 # If PhenoPacket in param -> load anlaysis json 4682 if param_exomiser.get("phenopacket", None): 4683 4684 param_exomiser_phenopacket = param_exomiser.get("phenopacket") 4685 param_exomiser_phenopacket = full_path( 4686 param_exomiser_phenopacket 4687 ) 4688 4689 # If param phenopacket is a file and exists 4690 if isinstance( 4691 param_exomiser_phenopacket, str 4692 ) and os.path.exists(param_exomiser_phenopacket): 4693 # Load phenopacket file into analysis dict (either yaml or json) 4694 with open(param_exomiser_phenopacket) as json_file: 4695 param_exomiser_analysis_dict["phenopacket"] = ( 4696 yaml.safe_load(json_file) 4697 ) 4698 4699 # If param phenopacket is a dict 4700 elif isinstance(param_exomiser_phenopacket, dict): 4701 # Load phenopacket dict into analysis dict (either yaml or json) 4702 param_exomiser_analysis_dict["phenopacket"] = ( 4703 
param_exomiser_phenopacket 4704 ) 4705 4706 # Error phenopacket type 4707 else: 4708 log.error(f"Phenopacket type unknown. Check param file.") 4709 raise ValueError( 4710 f"Phenopacket type unknown. Check param file." 4711 ) 4712 4713 # If no PhenoPacket in analysis dict -> construct from sample and HPO in param 4714 if "phenopacket" not in param_exomiser_analysis_dict: 4715 4716 # Init PhenoPacket 4717 param_exomiser_analysis_dict["phenopacket"] = { 4718 "id": "analysis", 4719 "proband": {}, 4720 } 4721 4722 ### Add subject ### 4723 4724 # If subject exists 4725 param_exomiser_subject = param_exomiser.get("subject", {}) 4726 4727 # If subject not exists -> found sample ID 4728 if not param_exomiser_subject: 4729 4730 # Found sample ID in param 4731 sample = param_exomiser.get("sample", None) 4732 4733 # Find sample ID (first sample) 4734 if not sample: 4735 sample_list = self.get_header_sample_list() 4736 if len(sample_list) > 0: 4737 sample = sample_list[0] 4738 else: 4739 log.error(f"No sample found") 4740 raise ValueError(f"No sample found") 4741 4742 # Create subject 4743 param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"} 4744 4745 # Add to dict 4746 param_exomiser_analysis_dict["phenopacket"][ 4747 "subject" 4748 ] = param_exomiser_subject 4749 4750 ### Add "phenotypicFeatures" ### 4751 4752 # If phenotypicFeatures exists 4753 param_exomiser_phenotypicfeatures = param_exomiser.get( 4754 "phenotypicFeatures", [] 4755 ) 4756 4757 # If phenotypicFeatures not exists -> Try to infer from hpo list 4758 if not param_exomiser_phenotypicfeatures: 4759 4760 # Found HPO in param 4761 param_exomiser_hpo = param_exomiser.get("hpo", []) 4762 4763 # Split HPO if list in string format separated by comma 4764 if isinstance(param_exomiser_hpo, str): 4765 param_exomiser_hpo = param_exomiser_hpo.split(",") 4766 4767 # Create HPO list 4768 for hpo in param_exomiser_hpo: 4769 hpo_clean = re.sub("[^0-9]", "", hpo) 4770 param_exomiser_phenotypicfeatures.append( 4771 { 
4772 "type": { 4773 "id": f"HP:{hpo_clean}", 4774 "label": f"HP:{hpo_clean}", 4775 } 4776 } 4777 ) 4778 4779 # Add to dict 4780 param_exomiser_analysis_dict["phenopacket"][ 4781 "phenotypicFeatures" 4782 ] = param_exomiser_phenotypicfeatures 4783 4784 # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step 4785 if not param_exomiser_phenotypicfeatures: 4786 for step in param_exomiser_analysis_dict.get( 4787 "analysis", {} 4788 ).get("steps", []): 4789 if "hiPhivePrioritiser" in step: 4790 param_exomiser_analysis_dict.get("analysis", {}).get( 4791 "steps", [] 4792 ).remove(step) 4793 4794 ### Add Input File ### 4795 4796 # Initial file name and htsFiles 4797 tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz") 4798 param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [ 4799 { 4800 "uri": tmp_vcf_name, 4801 "htsFormat": "VCF", 4802 "genomeAssembly": assembly, 4803 } 4804 ] 4805 4806 ### Add metaData ### 4807 4808 # If metaData not in analysis dict 4809 if "metaData" not in param_exomiser_analysis_dict: 4810 param_exomiser_analysis_dict["phenopacket"]["metaData"] = { 4811 "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z", 4812 "createdBy": "howard", 4813 "phenopacketSchemaVersion": 1, 4814 } 4815 4816 ### OutputOptions ### 4817 4818 # Init output result folder 4819 output_results = os.path.join(tmp_dir, "results") 4820 4821 # If no outputOptions in analysis dict 4822 if "outputOptions" not in param_exomiser_analysis_dict: 4823 4824 # default output formats 4825 defaut_output_formats = ["TSV_VARIANT", "VCF"] 4826 4827 # Get outputOptions in param 4828 output_options = param_exomiser.get("outputOptions", None) 4829 4830 # If no output_options in param -> check 4831 if not output_options: 4832 output_options = { 4833 "outputContributingVariantsOnly": False, 4834 "numGenes": 0, 4835 "outputFormats": defaut_output_formats, 4836 } 4837 4838 # Replace outputDirectory in output options 4839 output_options["outputDirectory"] = output_results 
4840 output_options["outputFileName"] = "howard" 4841 4842 # Add outputOptions in analysis dict 4843 param_exomiser_analysis_dict["outputOptions"] = output_options 4844 4845 else: 4846 4847 # Replace output_results and output format (if exists in param) 4848 param_exomiser_analysis_dict["outputOptions"][ 4849 "outputDirectory" 4850 ] = output_results 4851 param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = ( 4852 list( 4853 set( 4854 param_exomiser_analysis_dict.get( 4855 "outputOptions", {} 4856 ).get("outputFormats", []) 4857 + ["TSV_VARIANT", "VCF"] 4858 ) 4859 ) 4860 ) 4861 4862 # log 4863 log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}") 4864 4865 ### ANALYSIS FILE ### 4866 ##################### 4867 4868 ### Full JSON analysis config file ### 4869 4870 exomiser_analysis = os.path.join(tmp_dir, "analysis.json") 4871 with open(exomiser_analysis, "w") as fp: 4872 json.dump(param_exomiser_analysis_dict, fp, indent=4) 4873 4874 ### SPLIT analysis and sample config files 4875 4876 # Splitted analysis dict 4877 param_exomiser_analysis_dict_for_split = ( 4878 param_exomiser_analysis_dict.copy() 4879 ) 4880 4881 # Phenopacket JSON file 4882 exomiser_analysis_phenopacket = os.path.join( 4883 tmp_dir, "analysis_phenopacket.json" 4884 ) 4885 with open(exomiser_analysis_phenopacket, "w") as fp: 4886 json.dump( 4887 param_exomiser_analysis_dict_for_split.get("phenopacket"), 4888 fp, 4889 indent=4, 4890 ) 4891 4892 # Analysis JSON file without Phenopacket parameters 4893 param_exomiser_analysis_dict_for_split.pop("phenopacket") 4894 exomiser_analysis_analysis = os.path.join( 4895 tmp_dir, "analysis_analysis.json" 4896 ) 4897 with open(exomiser_analysis_analysis, "w") as fp: 4898 json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4) 4899 4900 ### INITAL VCF file ### 4901 ####################### 4902 4903 ### Create list of samples to use and include inti initial VCF file #### 4904 4905 # Subject (main sample) 4906 # Get sample ID in 
analysis dict 4907 sample_subject = ( 4908 param_exomiser_analysis_dict.get("phenopacket", {}) 4909 .get("subject", {}) 4910 .get("id", None) 4911 ) 4912 sample_proband = ( 4913 param_exomiser_analysis_dict.get("phenopacket", {}) 4914 .get("proband", {}) 4915 .get("subject", {}) 4916 .get("id", None) 4917 ) 4918 sample = [] 4919 if sample_subject: 4920 sample.append(sample_subject) 4921 if sample_proband: 4922 sample.append(sample_proband) 4923 4924 # Get sample ID within Pedigree 4925 pedigree_persons_list = ( 4926 param_exomiser_analysis_dict.get("phenopacket", {}) 4927 .get("pedigree", {}) 4928 .get("persons", {}) 4929 ) 4930 4931 # Create list with all sample ID in pedigree (if exists) 4932 pedigree_persons = [] 4933 for person in pedigree_persons_list: 4934 pedigree_persons.append(person.get("individualId")) 4935 4936 # Concat subject sample ID and samples ID in pedigreesamples 4937 samples = list(set(sample + pedigree_persons)) 4938 4939 # Check if sample list is not empty 4940 if not samples: 4941 log.error(f"No samples found") 4942 raise ValueError(f"No samples found") 4943 4944 # Create VCF with sample (either sample in param or first one by default) 4945 # Export VCF file 4946 self.export_variant_vcf( 4947 vcf_file=tmp_vcf_name, 4948 remove_info=True, 4949 add_samples=True, 4950 list_samples=samples, 4951 index=False, 4952 ) 4953 4954 ### Execute Exomiser ### 4955 ######################## 4956 4957 # Init command 4958 exomiser_command = "" 4959 4960 # Command exomiser options 4961 exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} " 4962 4963 # Release 4964 exomiser_release = param_exomiser.get("release", None) 4965 if exomiser_release: 4966 # phenotype data version 4967 exomiser_options += ( 4968 f" --exomiser.phenotype.data-version={exomiser_release} " 4969 ) 4970 # data version 4971 exomiser_options += ( 4972 f" 
--exomiser.{assembly}.data-version={exomiser_release} " 4973 ) 4974 # variant white list 4975 variant_white_list_file = ( 4976 f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz" 4977 ) 4978 if os.path.exists( 4979 os.path.join( 4980 databases_folders, assembly, variant_white_list_file 4981 ) 4982 ): 4983 exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} " 4984 4985 # transcript_source 4986 transcript_source = param_exomiser.get( 4987 "transcript_source", None 4988 ) # ucsc, refseq, ensembl 4989 if transcript_source: 4990 exomiser_options += ( 4991 f" --exomiser.{assembly}.transcript-source={transcript_source} " 4992 ) 4993 4994 # If analysis contain proband param 4995 if param_exomiser_analysis_dict.get("phenopacket", {}).get( 4996 "proband", {} 4997 ): 4998 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} " 4999 5000 # If no proband (usually uniq sample) 5001 else: 5002 exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}" 5003 5004 # Log 5005 log.debug(f"exomiser_command_analysis={exomiser_command_analysis}") 5006 5007 # Run command 5008 result = subprocess.call( 5009 exomiser_command_analysis.split(), stdout=subprocess.PIPE 5010 ) 5011 if result: 5012 log.error("Exomiser command failed") 5013 raise ValueError("Exomiser command failed") 5014 5015 ### RESULTS ### 5016 ############### 5017 5018 ### Annotate with TSV fields ### 5019 5020 # Init result tsv file 5021 exomiser_to_info = param_exomiser.get("exomiser_to_info", False) 5022 5023 # Init result tsv file 5024 output_results_tsv = os.path.join(output_results, "howard.variants.tsv") 5025 5026 # Parse TSV file and explode columns in INFO field 5027 if exomiser_to_info and os.path.exists(output_results_tsv): 5028 5029 # Log 5030 log.debug("Exomiser columns to VCF INFO field") 5031 5032 # Retrieve columns and 
types 5033 query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """ 5034 output_results_tsv_df = self.get_query_to_df(query) 5035 output_results_tsv_columns = output_results_tsv_df.columns.tolist() 5036 5037 # Init concat fields for update 5038 sql_query_update_concat_fields = [] 5039 5040 # Fields to avoid 5041 fields_to_avoid = [ 5042 "CONTIG", 5043 "START", 5044 "END", 5045 "REF", 5046 "ALT", 5047 "QUAL", 5048 "FILTER", 5049 "GENOTYPE", 5050 ] 5051 5052 # List all columns to add into header 5053 for header_column in output_results_tsv_columns: 5054 5055 # If header column is enable 5056 if header_column not in fields_to_avoid: 5057 5058 # Header info type 5059 header_info_type = "String" 5060 header_column_df = output_results_tsv_df[header_column] 5061 header_column_df_dtype = header_column_df.dtype 5062 if header_column_df_dtype == object: 5063 if ( 5064 pd.to_numeric(header_column_df, errors="coerce") 5065 .notnull() 5066 .all() 5067 ): 5068 header_info_type = "Float" 5069 else: 5070 header_info_type = "Integer" 5071 5072 # Header info 5073 characters_to_validate = ["-"] 5074 pattern = "[" + "".join(characters_to_validate) + "]" 5075 header_info_name = re.sub( 5076 pattern, 5077 "_", 5078 f"Exomiser_{header_column}".replace("#", ""), 5079 ) 5080 header_info_number = "." 
5081 header_info_description = ( 5082 f"Exomiser {header_column} annotation" 5083 ) 5084 header_info_source = "Exomiser" 5085 header_info_version = "unknown" 5086 header_info_code = CODE_TYPE_MAP[header_info_type] 5087 vcf_reader.infos[header_info_name] = vcf.parser._Info( 5088 header_info_name, 5089 header_info_number, 5090 header_info_type, 5091 header_info_description, 5092 header_info_source, 5093 header_info_version, 5094 header_info_code, 5095 ) 5096 5097 # Add field to add for update to concat fields 5098 sql_query_update_concat_fields.append( 5099 f""" 5100 CASE 5101 WHEN table_parquet."{header_column}" NOT IN ('','.') 5102 THEN concat( 5103 '{header_info_name}=', 5104 table_parquet."{header_column}", 5105 ';' 5106 ) 5107 5108 ELSE '' 5109 END 5110 """ 5111 ) 5112 5113 # Update query 5114 sql_query_update = f""" 5115 UPDATE {table_variants} as table_variants 5116 SET INFO = concat( 5117 CASE 5118 WHEN INFO NOT IN ('', '.') 5119 THEN INFO 5120 ELSE '' 5121 END, 5122 CASE 5123 WHEN table_variants.INFO NOT IN ('','.') 5124 THEN ';' 5125 ELSE '' 5126 END, 5127 ( 5128 SELECT 5129 concat( 5130 {",".join(sql_query_update_concat_fields)} 5131 ) 5132 FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet 5133 WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\" 5134 AND table_parquet.\"START\" = table_variants.\"POS\" 5135 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 5136 AND table_parquet.\"REF\" = table_variants.\"REF\" 5137 ) 5138 ) 5139 ; 5140 """ 5141 5142 # Update 5143 self.conn.execute(sql_query_update) 5144 5145 ### Annotate with VCF INFO field ### 5146 5147 # Init result VCF file 5148 output_results_vcf = os.path.join(output_results, "howard.vcf.gz") 5149 5150 # If VCF exists 5151 if os.path.exists(output_results_vcf): 5152 5153 # Log 5154 log.debug("Exomiser result VCF update variants") 5155 5156 # Find Exomiser INFO field annotation in header 5157 with 
gzip.open(output_results_vcf, "rt") as f: 5158 header_list = self.read_vcf_header(f) 5159 exomiser_vcf_header = vcf.Reader( 5160 io.StringIO("\n".join(header_list)) 5161 ) 5162 5163 # Add annotation INFO field to header 5164 vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"] 5165 5166 # Update variants with VCF 5167 self.update_from_vcf(output_results_vcf) 5168 5169 return True
This function annotates variants with Exomiser
This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
- "analysis" (dict/file): Full analysis dictionnary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO) Default : None
- "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
- "phenopacket" (dict/file): Samples and phenotipic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
- "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
- "sample" (string):
Sample name to construct "subject" section:
"subject":
{
"id": "
", "sex": "UNKNOWN_SEX" } Default: None - "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
- "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
- "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
- "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
- "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
- "release" (string): Exomise database release. If not exists, database release will be downloaded (take a while). Default: None (provided by application.properties configuration file)
- "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).
Notes:
- If no sample in parameters, first sample in VCF will be chosen
- If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
- threads: The number of threads to use
Returns
None.
5171 def annotation_snpeff(self, threads: int = None) -> None: 5172 """ 5173 This function annotate with snpEff 5174 5175 :param threads: The number of threads to use 5176 :return: the value of the variable "return_value". 5177 """ 5178 5179 # DEBUG 5180 log.debug("Start annotation with snpeff databases") 5181 5182 # Threads 5183 if not threads: 5184 threads = self.get_threads() 5185 log.debug("Threads: " + str(threads)) 5186 5187 # DEBUG 5188 delete_tmp = True 5189 if self.get_config().get("verbosity", "warning") in ["debug"]: 5190 delete_tmp = False 5191 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5192 5193 # Config 5194 config = self.get_config() 5195 log.debug("Config: " + str(config)) 5196 5197 # Config - Folders - Databases 5198 databases_folders = ( 5199 config.get("folders", {}).get("databases", {}).get("snpeff", ["."]) 5200 ) 5201 log.debug("Databases annotations: " + str(databases_folders)) 5202 5203 # Config - snpEff bin command 5204 snpeff_bin_command = get_bin_command( 5205 bin="snpEff.jar", 5206 tool="snpeff", 5207 bin_type="jar", 5208 config=config, 5209 default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff", 5210 ) 5211 if not snpeff_bin_command: 5212 msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'" 5213 log.error(msg_err) 5214 raise ValueError(msg_err) 5215 5216 # Config - snpEff databases 5217 snpeff_databases = ( 5218 config.get("folders", {}) 5219 .get("databases", {}) 5220 .get("snpeff", DEFAULT_SNPEFF_FOLDER) 5221 ) 5222 snpeff_databases = full_path(snpeff_databases) 5223 if snpeff_databases is not None and snpeff_databases != "": 5224 log.debug(f"Create snpEff databases folder") 5225 if not os.path.exists(snpeff_databases): 5226 os.makedirs(snpeff_databases) 5227 5228 # Param 5229 param = self.get_param() 5230 log.debug("Param: " + str(param)) 5231 5232 # Param 5233 options = param.get("annotation", {}).get("snpeff", {}).get("options", None) 5234 log.debug("Options: " + str(options)) 5235 5236 # Param - Assembly 
5237 assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY)) 5238 5239 # Param - Options 5240 snpeff_options = ( 5241 param.get("annotation", {}).get("snpeff", {}).get("options", "") 5242 ) 5243 snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None) 5244 snpeff_csvstats = ( 5245 param.get("annotation", {}).get("snpeff", {}).get("csvStats", None) 5246 ) 5247 if snpeff_stats: 5248 snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output()) 5249 snpeff_stats = full_path(snpeff_stats) 5250 snpeff_options += f" -stats {snpeff_stats}" 5251 if snpeff_csvstats: 5252 snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output()) 5253 snpeff_csvstats = full_path(snpeff_csvstats) 5254 snpeff_options += f" -csvStats {snpeff_csvstats}" 5255 5256 # Data 5257 table_variants = self.get_table_variants() 5258 5259 # Check if not empty 5260 log.debug("Check if not empty") 5261 sql_query_chromosomes = ( 5262 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 5263 ) 5264 # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]: 5265 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 5266 log.info(f"VCF empty") 5267 return 5268 5269 # Export in VCF 5270 log.debug("Create initial file to annotate") 5271 tmp_vcf = NamedTemporaryFile( 5272 prefix=self.get_prefix(), 5273 dir=self.get_tmp_dir(), 5274 suffix=".vcf.gz", 5275 delete=True, 5276 ) 5277 tmp_vcf_name = tmp_vcf.name 5278 5279 # VCF header 5280 vcf_reader = self.get_header() 5281 log.debug("Initial header: " + str(vcf_reader.infos)) 5282 5283 # Existing annotations 5284 for vcf_annotation in self.get_header().infos: 5285 5286 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5287 log.debug( 5288 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5289 ) 5290 5291 # Memory limit 5292 # if config.get("memory", None): 5293 # memory_limit = config.get("memory", "8G") 5294 # else: 5295 # 
memory_limit = "8G" 5296 memory_limit = self.get_memory("8G") 5297 log.debug(f"memory_limit: {memory_limit}") 5298 5299 # snpEff java options 5300 snpeff_java_options = ( 5301 f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} " 5302 ) 5303 log.debug(f"Exomiser java options: {snpeff_java_options}") 5304 5305 force_update_annotation = True 5306 5307 if "ANN" not in self.get_header().infos or force_update_annotation: 5308 5309 # Check snpEff database 5310 log.debug(f"Check snpEff databases {[assembly]}") 5311 databases_download_snpeff( 5312 folder=snpeff_databases, assemblies=[assembly], config=config 5313 ) 5314 5315 # Export VCF file 5316 self.export_variant_vcf( 5317 vcf_file=tmp_vcf_name, 5318 remove_info=True, 5319 add_samples=False, 5320 index=True, 5321 ) 5322 5323 # Tmp file 5324 err_files = [] 5325 tmp_annotate_vcf = NamedTemporaryFile( 5326 prefix=self.get_prefix(), 5327 dir=self.get_tmp_dir(), 5328 suffix=".vcf", 5329 delete=False, 5330 ) 5331 tmp_annotate_vcf_name = tmp_annotate_vcf.name 5332 tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err" 5333 err_files.append(tmp_annotate_vcf_name_err) 5334 5335 # Command 5336 snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}" 5337 log.debug(f"Annotation - snpEff command: {snpeff_command}") 5338 run_parallel_commands([snpeff_command], 1) 5339 5340 # Error messages 5341 log.info(f"Error/Warning messages:") 5342 error_message_command_all = [] 5343 error_message_command_warning = [] 5344 error_message_command_err = [] 5345 for err_file in err_files: 5346 with open(err_file, "r") as f: 5347 for line in f: 5348 message = line.strip() 5349 error_message_command_all.append(message) 5350 if line.startswith("[W::"): 5351 error_message_command_warning.append(message) 5352 if line.startswith("[E::"): 5353 error_message_command_err.append(f"{err_file}: " + message) 5354 # log info 
5355 for message in list( 5356 set(error_message_command_err + error_message_command_warning) 5357 ): 5358 log.info(f" {message}") 5359 # debug info 5360 for message in list(set(error_message_command_all)): 5361 log.debug(f" {message}") 5362 # failed 5363 if len(error_message_command_err): 5364 log.error("Annotation failed: Error in commands") 5365 raise ValueError("Annotation failed: Error in commands") 5366 5367 # Find annotation in header 5368 with open(tmp_annotate_vcf_name, "rt") as f: 5369 header_list = self.read_vcf_header(f) 5370 annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list))) 5371 5372 for ann in annovar_vcf_header.infos: 5373 if ann not in self.get_header().infos: 5374 vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann) 5375 5376 # Update variants 5377 log.info(f"Annotation - Updating...") 5378 self.update_from_vcf(tmp_annotate_vcf_name) 5379 5380 else: 5381 if "ANN" in self.get_header().infos: 5382 log.debug(f"Existing snpEff annotations in VCF") 5383 if force_update_annotation: 5384 log.debug(f"Existing snpEff annotations in VCF - annotation forced")
This function annotates variants with snpEff
Parameters
- threads: The number of threads to use
Returns
the value of the variable "return_value".
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate the variants table with Annovar databases.

        Exports the variants as a temporary VCF, runs one table_annovar.pl +
        bcftools/sed/awk shell pipeline per configured annotation database,
        merges all annotated VCFs back together with bcftools merge, adds the
        new INFO fields to the in-memory header, and updates the variants table.

        :param threads: number of threads to use; defaults to `self.get_threads()`
        :return: None. Returns early (without annotating) if the variants table is empty.
        :raises ValueError: if the annovar/bcftools binaries cannot be resolved,
            if the databases folder cannot be configured, or if a command wrote
            error lines to its stderr file.
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (collected for final cleanup / error scanning)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but the cleanup block at the end
        # runs unconditionally (`if True:`) — confirm whether cleanup should be
        # skipped in debug mode.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
        log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl wrapper resolved by helper)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (first element used if a list;
        # created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        if annovar_databases is not None:
            if isinstance(annovar_databases, list):
                annovar_databases = full_path(annovar_databases[0])
                log.warning(f"Annovar databases folder '{annovar_databases}' selected")
            annovar_databases = full_path(annovar_databases)
            if not os.path.exists(annovar_databases):
                log.info(f"Annovar databases folder '{annovar_databases}' created")
                Path(annovar_databases).mkdir(parents=True, exist_ok=True)
        else:
            msg_err = f"Annovar databases configuration failed"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl CLI options)
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database name -> fields mapping)
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug trace only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file (INFO replaced by "." for annovar input)
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (consumed by bcftools --rename-annots)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One annovar run (and shell pipeline) per annotation database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset here each iteration, so the
                # final merge's error scan only sees the last database's stderr.
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one "old new" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "f" filter-based by default, "g" gene-based for
                # refGene/ensGene families, "r" region-based for cytoBand
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ("genebase" is consumed above, not passed through)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f""" && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr files for warnings/errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed: any error line aborts the annotation
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine original VCF with all per-database outputs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF, and merge any
                # INFO fields not already present into the in-memory header
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                    annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants from the merged annotated VCF
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

        # Clean files
        # Tmp file remove command
        if True:
            tmp_files_remove_command = ""
            if tmp_files:
                tmp_files_remove_command = " ".join(tmp_files)
            clean_command = f" rm -f {tmp_files_remove_command} "
            log.debug(f"Annotation Annovar - Annotation cleaning ")
            log.debug(f"Annotation - cleaning command: {clean_command}")
            run_parallel_commands([clean_command], 1)
It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations
Parameters
- threads: number of threads to use
Returns
the value of the variable "return_value".
5777 def annotation_parquet(self, threads: int = None) -> None: 5778 """ 5779 It takes a VCF file, and annotates it with a parquet file 5780 5781 :param threads: number of threads to use for the annotation 5782 :return: the value of the variable "result". 5783 """ 5784 5785 # DEBUG 5786 log.debug("Start annotation with parquet databases") 5787 5788 # Threads 5789 if not threads: 5790 threads = self.get_threads() 5791 log.debug("Threads: " + str(threads)) 5792 5793 # DEBUG 5794 delete_tmp = True 5795 if self.get_config().get("verbosity", "warning") in ["debug"]: 5796 delete_tmp = False 5797 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 5798 5799 # Config 5800 databases_folders = set( 5801 self.get_config() 5802 .get("folders", {}) 5803 .get("databases", {}) 5804 .get("annotations", ["."]) 5805 + self.get_config() 5806 .get("folders", {}) 5807 .get("databases", {}) 5808 .get("parquet", ["."]) 5809 ) 5810 log.debug("Databases annotations: " + str(databases_folders)) 5811 5812 # Param 5813 annotations = ( 5814 self.get_param() 5815 .get("annotation", {}) 5816 .get("parquet", {}) 5817 .get("annotations", None) 5818 ) 5819 log.debug("Annotations: " + str(annotations)) 5820 5821 # Assembly 5822 assembly = self.get_param().get( 5823 "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY) 5824 ) 5825 5826 # Force Update Annotation 5827 force_update_annotation = ( 5828 self.get_param() 5829 .get("annotation", {}) 5830 .get("options", {}) 5831 .get("annotations_update", False) 5832 ) 5833 log.debug(f"force_update_annotation={force_update_annotation}") 5834 force_append_annotation = ( 5835 self.get_param() 5836 .get("annotation", {}) 5837 .get("options", {}) 5838 .get("annotations_append", False) 5839 ) 5840 log.debug(f"force_append_annotation={force_append_annotation}") 5841 5842 # Data 5843 table_variants = self.get_table_variants() 5844 5845 # Check if not empty 5846 log.debug("Check if not empty") 5847 sql_query_chromosomes_df = self.get_query_to_df( 
5848 f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1""" 5849 ) 5850 if not sql_query_chromosomes_df["count"][0]: 5851 log.info(f"VCF empty") 5852 return 5853 5854 # VCF header 5855 vcf_reader = self.get_header() 5856 log.debug("Initial header: " + str(vcf_reader.infos)) 5857 5858 # Nb Variants POS 5859 log.debug("NB Variants Start") 5860 nb_variants = self.conn.execute( 5861 f"SELECT count(*) AS count FROM variants" 5862 ).fetchdf()["count"][0] 5863 log.debug("NB Variants Stop") 5864 5865 # Existing annotations 5866 for vcf_annotation in self.get_header().infos: 5867 5868 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 5869 log.debug( 5870 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 5871 ) 5872 5873 # Added columns 5874 added_columns = [] 5875 5876 # drop indexes 5877 log.debug(f"Drop indexes...") 5878 self.drop_indexes() 5879 5880 if annotations: 5881 5882 if "ALL" in annotations: 5883 5884 all_param = annotations.get("ALL", {}) 5885 all_param_formats = all_param.get("formats", None) 5886 all_param_releases = all_param.get("releases", None) 5887 5888 databases_infos_dict = self.scan_databases( 5889 database_formats=all_param_formats, 5890 database_releases=all_param_releases, 5891 ) 5892 for database_infos in databases_infos_dict.keys(): 5893 if database_infos not in annotations: 5894 annotations[database_infos] = {"INFO": None} 5895 5896 for annotation in annotations: 5897 5898 if annotation in ["ALL"]: 5899 continue 5900 5901 # Annotation Name 5902 annotation_name = os.path.basename(annotation) 5903 5904 # Annotation fields 5905 annotation_fields = annotations[annotation] 5906 if not annotation_fields: 5907 annotation_fields = {"INFO": None} 5908 5909 log.debug(f"Annotation '{annotation_name}'") 5910 log.debug( 5911 f"Annotation '{annotation_name}' - fields: {annotation_fields}" 5912 ) 5913 5914 # Create Database 5915 database = Database( 5916 database=annotation, 5917 
databases_folders=databases_folders, 5918 assembly=assembly, 5919 ) 5920 5921 # Find files 5922 parquet_file = database.get_database() 5923 parquet_hdr_file = database.get_header_file() 5924 parquet_type = database.get_type() 5925 5926 # Check if files exists 5927 if not parquet_file or not parquet_hdr_file: 5928 msg_err_list = [] 5929 if not parquet_file: 5930 msg_err_list.append( 5931 f"Annotation failed: Annotation file not found" 5932 ) 5933 if parquet_file and not parquet_hdr_file: 5934 msg_err_list.append( 5935 f"Annotation failed: Annotation file '{parquet_file}' header not found. Check for file '{parquet_file}.hdr'" 5936 ) 5937 5938 log.error(". ".join(msg_err_list)) 5939 raise ValueError(". ".join(msg_err_list)) 5940 else: 5941 # Get parquet connexion 5942 parquet_sql_attach = database.get_sql_database_attach( 5943 output="query" 5944 ) 5945 if parquet_sql_attach: 5946 self.conn.execute(parquet_sql_attach) 5947 parquet_file_link = database.get_sql_database_link() 5948 # Log 5949 log.debug( 5950 f"Annotation '{annotation_name}' - file: " 5951 + str(parquet_file) 5952 + " and " 5953 + str(parquet_hdr_file) 5954 ) 5955 5956 # Database full header columns 5957 parquet_hdr_vcf_header_columns = database.get_header_file_columns( 5958 parquet_hdr_file 5959 ) 5960 # Log 5961 log.debug( 5962 "Annotation database header columns : " 5963 + str(parquet_hdr_vcf_header_columns) 5964 ) 5965 5966 # Load header as VCF object 5967 parquet_hdr_vcf_header_infos = database.get_header().infos 5968 # Log 5969 log.debug( 5970 "Annotation database header: " 5971 + str(parquet_hdr_vcf_header_infos) 5972 ) 5973 5974 # Get extra infos 5975 parquet_columns = database.get_extra_columns() 5976 # Log 5977 log.debug("Annotation database Columns: " + str(parquet_columns)) 5978 5979 # Add extra columns if "ALL" in annotation_fields 5980 # if "ALL" in annotation_fields: 5981 # allow_add_extra_column = True 5982 if "ALL" in annotation_fields and database.get_extra_columns(): 5983 for 
extra_column in database.get_extra_columns(): 5984 if ( 5985 extra_column not in annotation_fields 5986 and extra_column.replace("INFO/", "") 5987 not in parquet_hdr_vcf_header_infos 5988 ): 5989 parquet_hdr_vcf_header_infos[extra_column] = ( 5990 vcf.parser._Info( 5991 extra_column, 5992 ".", 5993 "String", 5994 f"{extra_column} description", 5995 "unknown", 5996 "unknown", 5997 self.code_type_map["String"], 5998 ) 5999 ) 6000 6001 # For all fields in database 6002 annotation_fields_all = False 6003 if "ALL" in annotation_fields or "INFO" in annotation_fields: 6004 annotation_fields_all = True 6005 annotation_fields = { 6006 key: key for key in parquet_hdr_vcf_header_infos 6007 } 6008 6009 log.debug( 6010 "Annotation database header - All annotations added: " 6011 + str(annotation_fields) 6012 ) 6013 6014 # Init 6015 6016 # List of annotation fields to use 6017 sql_query_annotation_update_info_sets = [] 6018 6019 # List of annotation to agregate 6020 sql_query_annotation_to_agregate = [] 6021 6022 # Number of fields 6023 nb_annotation_field = 0 6024 6025 # Annotation fields processed 6026 annotation_fields_processed = [] 6027 6028 # Columns mapping 6029 map_columns = database.map_columns( 6030 columns=annotation_fields, prefixes=["INFO/"] 6031 ) 6032 6033 # Query dict for fields to remove (update option) 6034 query_dict_remove = {} 6035 6036 # Fetch Anotation fields 6037 for annotation_field in annotation_fields: 6038 6039 # annotation_field_column 6040 annotation_field_column = map_columns.get( 6041 annotation_field, "INFO" 6042 ) 6043 6044 # field new name, if parametered 6045 annotation_fields_new_name = annotation_fields.get( 6046 annotation_field, annotation_field 6047 ) 6048 if not annotation_fields_new_name: 6049 annotation_fields_new_name = annotation_field 6050 6051 # To annotate 6052 # force_update_annotation = True 6053 # force_append_annotation = True 6054 # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or 
(annotation_fields_new_name not in self.get_header().infos)): 6055 if annotation_field in parquet_hdr_vcf_header_infos and ( 6056 force_update_annotation 6057 or force_append_annotation 6058 or ( 6059 annotation_fields_new_name 6060 not in self.get_header().infos 6061 ) 6062 ): 6063 6064 # Add field to annotation to process list 6065 annotation_fields_processed.append( 6066 annotation_fields_new_name 6067 ) 6068 6069 # explode infos for the field 6070 annotation_fields_new_name_info_msg = "" 6071 if ( 6072 force_update_annotation 6073 and annotation_fields_new_name 6074 in self.get_header().infos 6075 ): 6076 # Remove field from INFO 6077 query = f""" 6078 UPDATE {table_variants} as table_variants 6079 SET INFO = REGEXP_REPLACE( 6080 concat(table_variants.INFO,''), 6081 ';*{annotation_fields_new_name}=[^;]*', 6082 '' 6083 ) 6084 WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%' 6085 """ 6086 annotation_fields_new_name_info_msg = " [update]" 6087 query_dict_remove[ 6088 f"remove 'INFO/{annotation_fields_new_name}'" 6089 ] = query 6090 6091 # Sep between fields in INFO 6092 nb_annotation_field += 1 6093 if nb_annotation_field > 1: 6094 annotation_field_sep = ";" 6095 else: 6096 annotation_field_sep = "" 6097 6098 log.info( 6099 f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}" 6100 ) 6101 6102 # Add INFO field to header 6103 parquet_hdr_vcf_header_infos_number = ( 6104 parquet_hdr_vcf_header_infos[annotation_field].num 6105 or "." 
6106 ) 6107 parquet_hdr_vcf_header_infos_type = ( 6108 parquet_hdr_vcf_header_infos[annotation_field].type 6109 or "String" 6110 ) 6111 parquet_hdr_vcf_header_infos_description = ( 6112 parquet_hdr_vcf_header_infos[annotation_field].desc 6113 or f"{annotation_field} description" 6114 ) 6115 parquet_hdr_vcf_header_infos_source = ( 6116 parquet_hdr_vcf_header_infos[annotation_field].source 6117 or "unknown" 6118 ) 6119 parquet_hdr_vcf_header_infos_version = ( 6120 parquet_hdr_vcf_header_infos[annotation_field].version 6121 or "unknown" 6122 ) 6123 6124 vcf_reader.infos[annotation_fields_new_name] = ( 6125 vcf.parser._Info( 6126 annotation_fields_new_name, 6127 parquet_hdr_vcf_header_infos_number, 6128 parquet_hdr_vcf_header_infos_type, 6129 parquet_hdr_vcf_header_infos_description, 6130 parquet_hdr_vcf_header_infos_source, 6131 parquet_hdr_vcf_header_infos_version, 6132 self.code_type_map[ 6133 parquet_hdr_vcf_header_infos_type 6134 ], 6135 ) 6136 ) 6137 6138 # Append 6139 if force_append_annotation: 6140 query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """ 6141 else: 6142 query_case_when_append = "" 6143 6144 # Annotation/Update query fields 6145 # Found in INFO column 6146 if ( 6147 annotation_field_column == "INFO" 6148 and "INFO" in parquet_hdr_vcf_header_columns 6149 ): 6150 sql_query_annotation_update_info_sets.append( 6151 f""" 6152 CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append} 6153 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1)) 6154 ELSE '' 6155 END 6156 """ 6157 ) 6158 # Found in a specific column 6159 else: 6160 sql_query_annotation_update_info_sets.append( 6161 f""" 6162 CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append} 
6163 THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ',')) 6164 ELSE '' 6165 END 6166 """ 6167 ) 6168 sql_query_annotation_to_agregate.append( 6169 f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """ 6170 ) 6171 6172 # Not to annotate 6173 else: 6174 6175 if force_update_annotation: 6176 annotation_message = "forced" 6177 else: 6178 annotation_message = "skipped" 6179 6180 if annotation_field not in parquet_hdr_vcf_header_infos: 6181 log.warning( 6182 f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file" 6183 ) 6184 if annotation_fields_new_name in self.get_header().infos: 6185 log.warning( 6186 f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})" 6187 ) 6188 6189 # Check if ALL fields have to be annotated. 
Thus concat all INFO field 6190 # allow_annotation_full_info = True 6191 allow_annotation_full_info = not force_append_annotation 6192 6193 if parquet_type in ["regions"]: 6194 allow_annotation_full_info = False 6195 6196 if ( 6197 allow_annotation_full_info 6198 and nb_annotation_field == len(annotation_fields) 6199 and annotation_fields_all 6200 and ( 6201 "INFO" in parquet_hdr_vcf_header_columns 6202 and "INFO" in database.get_extra_columns() 6203 ) 6204 ): 6205 log.debug("Column INFO annotation enabled") 6206 sql_query_annotation_update_info_sets = [] 6207 sql_query_annotation_update_info_sets.append( 6208 f" table_parquet.INFO " 6209 ) 6210 6211 if sql_query_annotation_update_info_sets: 6212 6213 # Annotate 6214 log.info(f"Annotation '{annotation_name}' - Annotation...") 6215 6216 # Join query annotation update info sets for SQL 6217 sql_query_annotation_update_info_sets_sql = ",".join( 6218 sql_query_annotation_update_info_sets 6219 ) 6220 6221 # Check chromosomes list (and variants infos) 6222 sql_query_chromosomes = f""" 6223 SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants 6224 FROM {table_variants} as table_variants 6225 GROUP BY table_variants."#CHROM" 6226 ORDER BY table_variants."#CHROM" 6227 """ 6228 sql_query_chromosomes_df = self.conn.execute( 6229 sql_query_chromosomes 6230 ).df() 6231 sql_query_chromosomes_dict = { 6232 entry["CHROM"]: { 6233 "count": entry["count_variants"], 6234 "min": entry["min_variants"], 6235 "max": entry["max_variants"], 6236 } 6237 for index, entry in sql_query_chromosomes_df.iterrows() 6238 } 6239 6240 # Init 6241 nb_of_query = 0 6242 nb_of_variant_annotated = 0 6243 query_dict = query_dict_remove 6244 6245 # for chrom in sql_query_chromosomes_df["CHROM"]: 6246 for chrom in sql_query_chromosomes_dict: 6247 6248 # Number of variant by chromosome 6249 nb_of_variant_by_chrom = sql_query_chromosomes_dict.get( 6250 chrom, {} 6251 ).get("count", 0) 6252 6253 
log.debug( 6254 f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..." 6255 ) 6256 6257 # Annotation with regions database 6258 if parquet_type in ["regions"]: 6259 sql_query_annotation_from_clause = f""" 6260 FROM ( 6261 SELECT 6262 '{chrom}' AS \"#CHROM\", 6263 table_variants_from.\"POS\" AS \"POS\", 6264 {",".join(sql_query_annotation_to_agregate)} 6265 FROM {table_variants} as table_variants_from 6266 LEFT JOIN {parquet_file_link} as table_parquet_from ON ( 6267 table_parquet_from."#CHROM" = '{chrom}' 6268 AND table_variants_from.\"POS\" <= table_parquet_from.\"END\" 6269 AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1) 6270 OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1) 6271 ) 6272 ) 6273 WHERE table_variants_from.\"#CHROM\" in ('{chrom}') 6274 GROUP BY table_variants_from.\"POS\" 6275 ) 6276 as table_parquet 6277 """ 6278 6279 sql_query_annotation_where_clause = """ 6280 table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6281 AND table_parquet.\"POS\" = table_variants.\"POS\" 6282 """ 6283 6284 # Annotation with variants database 6285 else: 6286 sql_query_annotation_from_clause = f""" 6287 FROM {parquet_file_link} as table_parquet 6288 """ 6289 sql_query_annotation_where_clause = f""" 6290 table_variants."#CHROM" = '{chrom}' 6291 AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 6292 AND table_parquet.\"POS\" = table_variants.\"POS\" 6293 AND table_parquet.\"ALT\" = table_variants.\"ALT\" 6294 AND table_parquet.\"REF\" = table_variants.\"REF\" 6295 """ 6296 6297 # Create update query 6298 sql_query_annotation_chrom_interval_pos = f""" 6299 UPDATE {table_variants} as table_variants 6300 SET INFO = 6301 concat( 6302 CASE WHEN table_variants.INFO NOT IN ('','.') 6303 THEN table_variants.INFO 6304 ELSE '' 6305 END 6306 , 6307 CASE WHEN table_variants.INFO NOT IN ('','.') 6308 AND ( 6309 
concat({sql_query_annotation_update_info_sets_sql}) 6310 ) 6311 NOT IN ('','.') 6312 THEN ';' 6313 ELSE '' 6314 END 6315 , 6316 {sql_query_annotation_update_info_sets_sql} 6317 ) 6318 {sql_query_annotation_from_clause} 6319 WHERE {sql_query_annotation_where_clause} 6320 ; 6321 """ 6322 6323 # Add update query to dict 6324 query_dict[ 6325 f"{chrom} [{nb_of_variant_by_chrom} variants]" 6326 ] = sql_query_annotation_chrom_interval_pos 6327 6328 nb_of_query = len(query_dict) 6329 num_query = 0 6330 6331 # SET max_expression_depth TO x 6332 self.conn.execute("SET max_expression_depth TO 10000") 6333 6334 for query_name in query_dict: 6335 query = query_dict[query_name] 6336 num_query += 1 6337 log.info( 6338 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..." 6339 ) 6340 result = self.conn.execute(query) 6341 nb_of_variant_annotated_by_query = result.df()["Count"][0] 6342 nb_of_variant_annotated += nb_of_variant_annotated_by_query 6343 log.info( 6344 f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated" 6345 ) 6346 6347 log.info( 6348 f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)" 6349 ) 6350 6351 else: 6352 6353 log.info( 6354 f"Annotation '{annotation_name}' - No Annotations available" 6355 ) 6356 6357 log.debug("Final header: " + str(vcf_reader.infos)) 6358 6359 # Remove added columns 6360 for added_column in added_columns: 6361 self.drop_column(column=added_column)
It takes a VCF file and annotates it with a Parquet file.
Parameters
- threads: number of threads to use for the annotation
Returns
None; annotations are written into the variants table in place.
6363 def annotation_splice(self, threads: int = None) -> None: 6364 """ 6365 This function annotate with snpEff 6366 6367 :param threads: The number of threads to use 6368 :return: the value of the variable "return_value". 6369 """ 6370 6371 # DEBUG 6372 log.debug("Start annotation with splice tools") 6373 6374 # Threads 6375 if not threads: 6376 threads = self.get_threads() 6377 log.debug("Threads: " + str(threads)) 6378 6379 # DEBUG 6380 delete_tmp = True 6381 if self.get_config().get("verbosity", "warning") in ["debug"]: 6382 delete_tmp = False 6383 log.debug("Delete tmp files/folders: " + str(delete_tmp)) 6384 6385 # Config 6386 config = self.get_config() 6387 log.debug("Config: " + str(config)) 6388 splice_config = config.get("tools", {}).get("splice", {}) 6389 if not splice_config: 6390 splice_config = DEFAULT_TOOLS_BIN.get("splice", {}) 6391 msg_err = "No Splice tool config" 6392 raise ValueError(msg_err) 6393 log.debug(f"splice_config: {splice_config}") 6394 6395 # Config - Folders - Databases 6396 databases_folders = ( 6397 config.get("folders", {}).get("databases", {}).get("splice", ["."]) 6398 ) 6399 log.debug("Databases annotations: " + str(databases_folders)) 6400 6401 # Splice docker image 6402 splice_docker_image = splice_config.get("docker").get("image") 6403 6404 # Pull splice image if it's not already there 6405 if not check_docker_image_exists(splice_docker_image): 6406 log.warning( 6407 f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub" 6408 ) 6409 try: 6410 command(f"docker pull {splice_config.get('docker').get('image')}") 6411 except subprocess.CalledProcessError: 6412 msg_err = f"Unable to find docker {splice_docker_image} on dockerhub" 6413 log.error(msg_err) 6414 raise ValueError(msg_err) 6415 6416 # Config - splice databases 6417 splice_databases = ( 6418 config.get("folders", {}) 6419 .get("databases", {}) 6420 .get("splice", DEFAULT_SPLICE_FOLDER) 6421 ) 6422 splice_databases = 
full_path(splice_databases) 6423 6424 # Param 6425 param = self.get_param() 6426 log.debug("Param: " + str(param)) 6427 6428 # Param 6429 options = param.get("annotation", {}).get("splice", {}).get("options", {}) 6430 log.debug("Options: " + str(options)) 6431 6432 # Data 6433 table_variants = self.get_table_variants() 6434 6435 # Check if not empty 6436 log.debug("Check if not empty") 6437 sql_query_chromosomes = ( 6438 f"""SELECT count(*) as count FROM {table_variants} as table_variants""" 6439 ) 6440 if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]: 6441 log.info("VCF empty") 6442 return None 6443 6444 # Export in VCF 6445 log.debug("Create initial file to annotate") 6446 6447 # Create output folder / work folder 6448 if options.get("output_folder", ""): 6449 output_folder = options.get("output_folder", "") 6450 if not os.path.exists(output_folder): 6451 Path(output_folder).mkdir(parents=True, exist_ok=True) 6452 else: 6453 output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}") 6454 if not os.path.exists(output_folder): 6455 Path(output_folder).mkdir(parents=True, exist_ok=True) 6456 6457 if options.get("workdir", ""): 6458 workdir = options.get("workdir", "") 6459 else: 6460 workdir = "/work" 6461 6462 # Create tmp VCF file 6463 tmp_vcf = NamedTemporaryFile( 6464 prefix=self.get_prefix(), 6465 dir=output_folder, 6466 suffix=".vcf", 6467 delete=False, 6468 ) 6469 tmp_vcf_name = tmp_vcf.name 6470 6471 # VCF header 6472 header = self.get_header() 6473 6474 # Existing annotations 6475 for vcf_annotation in self.get_header().infos: 6476 6477 vcf_annotation_line = self.get_header().infos.get(vcf_annotation) 6478 log.debug( 6479 f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]" 6480 ) 6481 6482 # Memory limit 6483 if config.get("memory", None): 6484 memory_limit = config.get("memory", "8G").upper() 6485 # upper() 6486 else: 6487 memory_limit = "8G" 6488 log.debug(f"memory_limit: {memory_limit}") 6489 6490 # 
Check number of variants to annotate 6491 where_clause_regex_spliceai = r"SpliceAI_\w+" 6492 where_clause_regex_spip = r"SPiP_\w+" 6493 where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')""" 6494 df_list_of_variants_to_annotate = self.get_query_to_df( 6495 query=f""" SELECT * FROM variants {where_clause} """ 6496 ) 6497 if len(df_list_of_variants_to_annotate) == 0: 6498 log.warning( 6499 f"No variants to annotate with splice. Variants probably already annotated with splice" 6500 ) 6501 return None 6502 else: 6503 log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants") 6504 6505 # Export VCF file 6506 self.export_variant_vcf( 6507 vcf_file=tmp_vcf_name, 6508 remove_info=True, 6509 add_samples=True, 6510 index=False, 6511 where_clause=where_clause, 6512 ) 6513 mount = [f" -v {path}:{path}:rw" for path in [output_folder]] 6514 if any(value for value in splice_config.values() if value is None): 6515 log.warning("At least one splice config parameter is empty") 6516 # exit annotation_splice 6517 return None 6518 6519 # Params in splice nf 6520 def check_values(dico: dict): 6521 """ 6522 Ensure parameters for NF splice pipeline 6523 """ 6524 for key, val in dico.items(): 6525 if key == "genome": 6526 if any( 6527 assemb in options.get("genome", {}) 6528 for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"] 6529 ): 6530 yield f"--{key} hg19" 6531 elif any( 6532 assemb in options.get("genome", {}) 6533 for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"] 6534 ): 6535 yield f"--{key} hg38" 6536 elif ( 6537 (isinstance(val, str) and val) 6538 or isinstance(val, int) 6539 or isinstance(val, bool) 6540 ): 6541 yield f"--{key} {val}" 6542 6543 # Genome 6544 genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY)) 6545 options["genome"] = genome 6546 # NF params 6547 nf_params = [] 6548 # Add options 6549 if options: 6550 log.debug(options) 6551 nf_params 
= list(check_values(options)) 6552 log.debug(f"Splice NF params: {' '.join(nf_params)}") 6553 else: 6554 log.debug("No NF params provided") 6555 # Add threads 6556 if "threads" not in options.keys(): 6557 nf_params.append(f"--threads {threads}") 6558 # Genome path 6559 genome_path = find_genome( 6560 config.get("folders", {}) 6561 .get("databases", {}) 6562 .get("genomes", DEFAULT_GENOME_FOLDER), 6563 file=f"{genome}.fa", 6564 ) 6565 # Add genome path 6566 if not genome_path: 6567 raise ValueError( 6568 f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}" 6569 ) 6570 else: 6571 log.debug(f"Genome: {genome_path}") 6572 nf_params.append(f"--genome_path {genome_path}") 6573 6574 def splice_annotations(options: dict = {}, config: dict = {}) -> list: 6575 """ 6576 Setting up updated databases for SPiP and SpliceAI 6577 """ 6578 6579 try: 6580 6581 # SpliceAI assembly transcriptome 6582 spliceai_assembly = os.path.join( 6583 config.get("folders", {}).get("databases", {}).get("spliceai", {}), 6584 options.get("genome"), 6585 "transcriptome", 6586 ) 6587 spip_assembly = options.get("genome") 6588 6589 spip = find( 6590 f"transcriptome_{spip_assembly}.RData", 6591 config.get("folders", {}).get("databases", {}).get("spip", {}), 6592 ) 6593 spliceai = find("spliceai.refseq.txt", spliceai_assembly) 6594 log.debug(f"SPiP annotations: {spip}") 6595 log.debug(f"SpliceAI annotations: {spliceai}") 6596 if spip and spliceai: 6597 return [ 6598 f"--spip_transcriptome {spip}", 6599 f"--spliceai_transcriptome {spliceai}", 6600 ] 6601 else: 6602 log.warning( 6603 "Can't find splice databases in configuration, use annotations file from image" 6604 ) 6605 except TypeError: 6606 log.warning( 6607 "Can't find splice databases in configuration, use annotations file from image" 6608 ) 6609 return [] 6610 6611 # Add options, check if transcriptome option have already beend provided 6612 if ( 6613 
"spip_transcriptome" not in nf_params 6614 and "spliceai_transcriptome" not in nf_params 6615 ): 6616 splice_reference = splice_annotations(options, config) 6617 if splice_reference: 6618 nf_params.extend(splice_reference) 6619 # nf_params.append(f"--output_folder {output_folder}") 6620 random_uuid = f"HOWARD-SPLICE-{get_random()}" 6621 cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline" 6622 log.debug(cmd) 6623 splice_config["docker"]["command"] = cmd 6624 6625 # Ensure proxy is set 6626 proxy = [ 6627 f"-e {var}={os.getenv(var)}" 6628 for var in ["https_proxy", "http_proxy", "ftp_proxy"] 6629 if os.getenv(var) is not None 6630 ] 6631 docker_cmd = get_bin_command( 6632 tool="splice", 6633 bin_type="docker", 6634 config=config, 6635 default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker", 6636 add_options=f"--name {random_uuid} {' '.join(mount)} -e NXF_DISABLE_CHECK_LATEST=true {' '.join(proxy)}", 6637 ) 6638 # print(docker_cmd) 6639 # exit() 6640 # Docker debug 6641 # if splice_config.get("rm_container"): 6642 # rm_container = "--rm" 6643 # else: 6644 # rm_container = "" 6645 # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}" 6646 log.debug(docker_cmd) 6647 res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True) 6648 log.debug(res.stdout) 6649 if res.stderr: 6650 log.error(res.stderr) 6651 res.check_returncode() 6652 # Update variants 6653 log.info("Annotation - Updating...") 6654 # Test find output vcf 6655 log.debug( 6656 f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6657 ) 6658 output_vcf = [] 6659 # Wrong folder to look in 6660 for 
files in os.listdir(os.path.dirname(tmp_vcf_name)): 6661 if ( 6662 files 6663 == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz" 6664 ): 6665 output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files)) 6666 # log.debug(os.listdir(options.get("output_folder"))) 6667 log.debug(f"Splice annotated vcf: {output_vcf[0]}") 6668 if not output_vcf: 6669 log.debug( 6670 f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz" 6671 ) 6672 else: 6673 # Get new header from annotated vcf 6674 log.debug(f"Initial header: {len(header.infos)} fields") 6675 # Create new header with splice infos 6676 new_vcf = Variants(input=output_vcf[0]) 6677 new_vcf_header = new_vcf.get_header().infos 6678 for keys, infos in new_vcf_header.items(): 6679 if keys not in header.infos.keys(): 6680 header.infos[keys] = infos 6681 log.debug(f"New header: {len(header.infos)} fields") 6682 log.debug(f"Splice tmp output: {output_vcf[0]}") 6683 self.update_from_vcf(output_vcf[0]) 6684 6685 # Remove file 6686 remove_if_exists(output_vcf)
This function annotates variants with splice tools (SPiP and SpliceAI)
Parameters
- threads: the number of threads to use
Returns
None; annotations are merged back into the variants table.
6692 def get_config_default(self, name: str) -> dict: 6693 """ 6694 The function `get_config_default` returns a dictionary containing default configurations for 6695 various calculations and prioritizations. 6696 6697 :param name: The `get_config_default` function returns a dictionary containing default 6698 configurations for different calculations and prioritizations. The `name` parameter is used to 6699 specify which specific configuration to retrieve from the dictionary 6700 :type name: str 6701 :return: The function `get_config_default` returns a dictionary containing default configuration 6702 settings for different calculations and prioritizations. The specific configuration settings are 6703 retrieved based on the input `name` parameter provided to the function. If the `name` parameter 6704 matches a key in the `config_default` dictionary, the corresponding configuration settings are 6705 returned. If there is no match, an empty dictionary is returned. 6706 """ 6707 6708 config_default = { 6709 "calculations": { 6710 "variant_chr_pos_alt_ref": { 6711 "type": "sql", 6712 "name": "variant_chr_pos_alt_ref", 6713 "description": "Create a variant ID with chromosome, position, alt and ref", 6714 "available": False, 6715 "output_column_name": "variant_chr_pos_alt_ref", 6716 "output_column_type": "String", 6717 "output_column_description": "variant ID with chromosome, position, alt and ref", 6718 "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """, 6719 "operation_info": True, 6720 }, 6721 "VARTYPE": { 6722 "type": "sql", 6723 "name": "VARTYPE", 6724 "description": "Variant type (e.g. 
SNV, INDEL, MNV, BND...)", 6725 "available": True, 6726 "output_column_name": "VARTYPE", 6727 "output_column_type": "String", 6728 "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ", 6729 "operation_query": """ 6730 CASE 6731 WHEN "SVTYPE" NOT NULL THEN "SVTYPE" 6732 WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV' 6733 WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC' 6734 WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV' 6735 WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL' 6736 ELSE 'UNDEFINED' 6737 END 6738 """, 6739 "info_fields": ["SVTYPE"], 6740 "operation_info": True, 6741 }, 6742 "snpeff_hgvs": { 6743 "type": "python", 6744 "name": "snpeff_hgvs", 6745 "description": "HGVS nomenclatures from snpEff annotation", 6746 "available": True, 6747 "function_name": "calculation_extract_snpeff_hgvs", 6748 "function_params": ["snpeff_hgvs", "ANN"], 6749 }, 6750 "snpeff_ann_explode": { 6751 "type": "python", 6752 "name": "snpeff_ann_explode", 6753 "description": "Explode snpEff annotations with uniquify values", 6754 "available": True, 6755 "function_name": "calculation_snpeff_ann_explode", 6756 "function_params": [False, "fields", "snpeff_", "ANN"], 6757 }, 6758 "snpeff_ann_explode_uniquify": { 6759 "type": "python", 6760 "name": "snpeff_ann_explode_uniquify", 6761 "description": "Explode snpEff annotations", 6762 "available": True, 6763 "function_name": "calculation_snpeff_ann_explode", 6764 "function_params": [True, "fields", "snpeff_uniquify_", "ANN"], 6765 }, 6766 "snpeff_ann_explode_json": { 6767 "type": "python", 6768 "name": "snpeff_ann_explode_json", 6769 "description": "Explode snpEff annotations in JSON format", 6770 "available": True, 6771 "function_name": "calculation_snpeff_ann_explode", 6772 "function_params": [False, "JSON", "snpeff_json", "ANN"], 6773 }, 6774 "NOMEN": { 6775 "type": "python", 6776 "name": "NOMEN", 6777 "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) 
from HGVS nomenclature field (see parameters help)", 6778 "available": True, 6779 "function_name": "calculation_extract_nomen", 6780 "function_params": [], 6781 }, 6782 "FINDBYPIPELINE": { 6783 "type": "python", 6784 "name": "FINDBYPIPELINE", 6785 "description": "Number of pipeline that identify the variant (for multi pipeline VCF)", 6786 "available": True, 6787 "function_name": "calculation_find_by_pipeline", 6788 "function_params": ["findbypipeline"], 6789 }, 6790 "FINDBYSAMPLE": { 6791 "type": "python", 6792 "name": "FINDBYSAMPLE", 6793 "description": "Number of sample that have a genotype for the variant (for multi sample VCF)", 6794 "available": True, 6795 "function_name": "calculation_find_by_pipeline", 6796 "function_params": ["findbysample"], 6797 }, 6798 "GENOTYPECONCORDANCE": { 6799 "type": "python", 6800 "name": "GENOTYPECONCORDANCE", 6801 "description": "Concordance of genotype for multi caller VCF", 6802 "available": True, 6803 "function_name": "calculation_genotype_concordance", 6804 "function_params": [], 6805 }, 6806 "BARCODE": { 6807 "type": "python", 6808 "name": "BARCODE", 6809 "description": "BARCODE as VaRank tool", 6810 "available": True, 6811 "function_name": "calculation_barcode", 6812 "function_params": [], 6813 }, 6814 "BARCODEFAMILY": { 6815 "type": "python", 6816 "name": "BARCODEFAMILY", 6817 "description": "BARCODEFAMILY as VaRank tool", 6818 "available": True, 6819 "function_name": "calculation_barcode_family", 6820 "function_params": ["BCF"], 6821 }, 6822 "TRIO": { 6823 "type": "python", 6824 "name": "TRIO", 6825 "description": "Inheritance for a trio family", 6826 "available": True, 6827 "function_name": "calculation_trio", 6828 "function_params": [], 6829 }, 6830 "VAF": { 6831 "type": "python", 6832 "name": "VAF", 6833 "description": "Variant Allele Frequency (VAF) harmonization", 6834 "available": True, 6835 "function_name": "calculation_vaf_normalization", 6836 "function_params": [], 6837 }, 6838 "VAF_stats": { 6839 "type": 
"python", 6840 "name": "VAF_stats", 6841 "description": "Variant Allele Frequency (VAF) statistics", 6842 "available": True, 6843 "function_name": "calculation_genotype_stats", 6844 "function_params": ["VAF"], 6845 }, 6846 "DP_stats": { 6847 "type": "python", 6848 "name": "DP_stats", 6849 "description": "Depth (DP) statistics", 6850 "available": True, 6851 "function_name": "calculation_genotype_stats", 6852 "function_params": ["DP"], 6853 }, 6854 "variant_id": { 6855 "type": "python", 6856 "name": "variant_id", 6857 "description": "Variant ID generated from variant position and type", 6858 "available": True, 6859 "function_name": "calculation_variant_id", 6860 "function_params": [], 6861 }, 6862 "transcripts_json": { 6863 "type": "python", 6864 "name": "transcripts_json", 6865 "description": "Add transcripts annotations in JSON format (field 'transcripts_json')", 6866 "available": True, 6867 "function_name": "calculation_transcripts_annotation", 6868 "function_params": ["transcripts_json", None], 6869 }, 6870 "transcripts_ann": { 6871 "type": "python", 6872 "name": "transcripts_ann", 6873 "description": "Add transcripts annotations in structured format (field 'transcripts_ann')", 6874 "available": True, 6875 "function_name": "calculation_transcripts_annotation", 6876 "function_params": [None, "transcripts_ann"], 6877 }, 6878 "transcripts_annotations": { 6879 "type": "python", 6880 "name": "transcripts_annotations", 6881 "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)", 6882 "available": True, 6883 "function_name": "calculation_transcripts_annotation", 6884 "function_params": [None, None], 6885 }, 6886 "transcripts_prioritization": { 6887 "type": "python", 6888 "name": "transcripts_prioritization", 6889 "description": "Prioritize transcripts with a prioritization profile (using param.json)", 6890 "available": True, 6891 "function_name": "calculation_transcripts_prioritization", 6892 "function_params": [], 6893 }, 
6894 "transcripts_export": { 6895 "type": "python", 6896 "name": "transcripts_export", 6897 "description": "Export transcripts table/view as a file (using param.json)", 6898 "available": True, 6899 "function_name": "calculation_transcripts_export", 6900 "function_params": [], 6901 }, 6902 }, 6903 "prioritizations": { 6904 "default": { 6905 "ANN2": [ 6906 { 6907 "type": "contains", 6908 "value": "HIGH", 6909 "score": 5, 6910 "flag": "PASS", 6911 "comment": [ 6912 "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay" 6913 ], 6914 }, 6915 { 6916 "type": "contains", 6917 "value": "MODERATE", 6918 "score": 3, 6919 "flag": "PASS", 6920 "comment": [ 6921 "A non-disruptive variant that might change protein effectiveness" 6922 ], 6923 }, 6924 { 6925 "type": "contains", 6926 "value": "LOW", 6927 "score": 0, 6928 "flag": "FILTERED", 6929 "comment": [ 6930 "Assumed to be mostly harmless or unlikely to change protein behavior" 6931 ], 6932 }, 6933 { 6934 "type": "contains", 6935 "value": "MODIFIER", 6936 "score": 0, 6937 "flag": "FILTERED", 6938 "comment": [ 6939 "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact" 6940 ], 6941 }, 6942 ], 6943 } 6944 }, 6945 } 6946 6947 return config_default.get(name, None)
The function get_config_default returns a dictionary containing default configurations for
various calculations and prioritizations.
Parameters
- name: The
get_config_defaultfunction returns a dictionary containing default configurations for different calculations and prioritizations. Thenameparameter is used to specify which specific configuration to retrieve from the dictionary
Returns
The function
get_config_defaultreturns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the inputnameparameter provided to the function. If thenameparameter matches a key in theconfig_defaultdictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.
6949 def get_config_json( 6950 self, name: str, config_dict: dict = {}, config_file: str = None 6951 ) -> dict: 6952 """ 6953 The function `get_config_json` retrieves a configuration JSON object with prioritizations from 6954 default values, a dictionary, and a file. 6955 6956 :param name: The `name` parameter in the `get_config_json` function is a string that represents 6957 the name of the configuration. It is used to identify and retrieve the configuration settings 6958 for a specific component or module 6959 :type name: str 6960 :param config_dict: The `config_dict` parameter in the `get_config_json` function is a 6961 dictionary that allows you to provide additional configuration settings or overrides. When you 6962 call the `get_config_json` function, you can pass a dictionary containing key-value pairs where 6963 the key is the configuration setting you want to override or 6964 :type config_dict: dict 6965 :param config_file: The `config_file` parameter in the `get_config_json` function is used to 6966 specify the path to a configuration file that contains additional settings. If provided, the 6967 function will read the contents of this file and update the configuration dictionary with the 6968 values found in the file, overriding any existing values with the 6969 :type config_file: str 6970 :return: The function `get_config_json` returns a dictionary containing the configuration 6971 settings. 
6972 """ 6973 6974 # Create with default prioritizations 6975 config_default = self.get_config_default(name=name) 6976 configuration = config_default 6977 # log.debug(f"configuration={configuration}") 6978 6979 # Replace prioritizations from dict 6980 for config in config_dict: 6981 configuration[config] = config_dict[config] 6982 6983 # Replace prioritizations from file 6984 config_file = full_path(config_file) 6985 if config_file: 6986 if os.path.exists(config_file): 6987 with open(config_file) as config_file_content: 6988 config_file_dict = json.load(config_file_content) 6989 for config in config_file_dict: 6990 configuration[config] = config_file_dict[config] 6991 else: 6992 msg_error = f"Config '{name}' file '{config_file}' does NOT exist" 6993 log.error(msg_error) 6994 raise ValueError(msg_error) 6995 6996 return configuration
The function get_config_json retrieves a configuration JSON object with prioritizations from
default values, a dictionary, and a file.
Parameters
- name: The
nameparameter in theget_config_jsonfunction is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module - config_dict: The
config_dictparameter in theget_config_jsonfunction is a dictionary that allows you to provide additional configuration settings or overrides. When you call theget_config_jsonfunction, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or - config_file: The
config_fileparameter in theget_config_jsonfunction is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns
The function
get_config_jsonreturns a dictionary containing the configuration settings.
6998 def prioritization( 6999 self, table: str = None, pz_prefix: str = None, pz_param: dict = None 7000 ) -> bool: 7001 """ 7002 The `prioritization` function in Python processes VCF files, adds new INFO fields, and 7003 prioritizes variants based on configured profiles and criteria. 7004 7005 :param table: The `table` parameter in the `prioritization` function is used to specify the name 7006 of the table (presumably a VCF file) on which the prioritization operation will be performed. If 7007 a table name is provided, the method will prioritize the variants in that specific table 7008 :type table: str 7009 :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to 7010 certain INFO fields in a VCF file during the prioritization process. If this parameter is not 7011 provided, the code will use a default prefix value of "PZ" 7012 :type pz_prefix: str 7013 :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass 7014 additional parameters specific to the prioritization process. These parameters can include 7015 settings related to prioritization profiles, fields, scoring modes, flags, comments, and other 7016 configurations needed for the prioritization of variants in a V 7017 :type pz_param: dict 7018 :return: A boolean value (True) is being returned from the `prioritization` function. 
7019 """ 7020 7021 # Config 7022 config = self.get_config() 7023 7024 # Param 7025 param = self.get_param() 7026 7027 # Prioritization param 7028 if pz_param is not None: 7029 prioritization_param = pz_param 7030 else: 7031 prioritization_param = param.get("prioritization", {}) 7032 7033 # Configuration profiles 7034 prioritization_config_file = prioritization_param.get( 7035 "prioritization_config", None 7036 ) 7037 prioritization_config_file = full_path(prioritization_config_file) 7038 prioritizations_config = self.get_config_json( 7039 name="prioritizations", config_file=prioritization_config_file 7040 ) 7041 7042 # Prioritization prefix 7043 pz_prefix_default = "PZ" 7044 if pz_prefix is None: 7045 pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default) 7046 7047 # Prioritization options 7048 profiles = prioritization_param.get("profiles", []) 7049 if isinstance(profiles, str): 7050 profiles = profiles.split(",") 7051 pzfields = prioritization_param.get( 7052 "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"] 7053 ) 7054 if isinstance(pzfields, str): 7055 pzfields = pzfields.split(",") 7056 default_profile = prioritization_param.get("default_profile", None) 7057 pzfields_sep = prioritization_param.get("pzfields_sep", "_") 7058 prioritization_score_mode = prioritization_param.get( 7059 "prioritization_score_mode", "HOWARD" 7060 ) 7061 7062 # Quick Prioritizations 7063 prioritizations = param.get("prioritizations", None) 7064 if prioritizations: 7065 log.info("Quick Prioritization:") 7066 for profile in prioritizations.split(","): 7067 if profile not in profiles: 7068 profiles.append(profile) 7069 log.info(f" {profile}") 7070 7071 # If profile "ALL" provided, all profiles in the config profiles 7072 if "ALL" in profiles: 7073 profiles = list(prioritizations_config.keys()) 7074 7075 for profile in profiles: 7076 if prioritizations_config.get(profile, None): 7077 log.debug(f"Profile '{profile}' configured") 7078 else: 7079 msg_error = f"Profile 
'{profile}' NOT configured" 7080 log.error(msg_error) 7081 raise ValueError(msg_error) 7082 7083 if profiles: 7084 log.info(f"Prioritization... ") 7085 else: 7086 log.debug(f"No profile defined") 7087 return False 7088 7089 if not default_profile and len(profiles): 7090 default_profile = profiles[0] 7091 7092 log.debug("Profiles availables: " + str(list(prioritizations_config.keys()))) 7093 log.debug("Profiles to check: " + str(list(profiles))) 7094 7095 # Variables 7096 if table is not None: 7097 table_variants = table 7098 else: 7099 table_variants = self.get_table_variants(clause="update") 7100 log.debug(f"Table to prioritize: {table_variants}") 7101 7102 # Added columns 7103 added_columns = [] 7104 7105 # Create list of PZfields 7106 # List of PZFields 7107 list_of_pzfields_original = pzfields + [ 7108 pzfield + pzfields_sep + profile 7109 for pzfield in pzfields 7110 for profile in profiles 7111 ] 7112 list_of_pzfields = [] 7113 log.debug(f"{list_of_pzfields_original}") 7114 7115 # Remove existing PZfields to use if exists 7116 for pzfield in list_of_pzfields_original: 7117 if self.get_header().infos.get(pzfield, None) is None: 7118 list_of_pzfields.append(pzfield) 7119 log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF") 7120 else: 7121 log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF") 7122 7123 if list_of_pzfields: 7124 7125 # Explode Infos prefix 7126 explode_infos_prefix = self.get_explode_infos_prefix() 7127 7128 # PZfields tags description 7129 PZfields_INFOS = { 7130 f"{pz_prefix}Tags": { 7131 "ID": f"{pz_prefix}Tags", 7132 "Number": ".", 7133 "Type": "String", 7134 "Description": "Variant tags based on annotation criteria", 7135 }, 7136 f"{pz_prefix}Score": { 7137 "ID": f"{pz_prefix}Score", 7138 "Number": 1, 7139 "Type": "Integer", 7140 "Description": "Variant score based on annotation criteria", 7141 }, 7142 f"{pz_prefix}Flag": { 7143 "ID": f"{pz_prefix}Flag", 7144 "Number": 1, 7145 "Type": "String", 7146 
"Description": "Variant flag based on annotation criteria", 7147 }, 7148 f"{pz_prefix}Comment": { 7149 "ID": f"{pz_prefix}Comment", 7150 "Number": ".", 7151 "Type": "String", 7152 "Description": "Variant comment based on annotation criteria", 7153 }, 7154 f"{pz_prefix}Infos": { 7155 "ID": f"{pz_prefix}Infos", 7156 "Number": ".", 7157 "Type": "String", 7158 "Description": "Variant infos based on annotation criteria", 7159 }, 7160 f"{pz_prefix}Class": { 7161 "ID": f"{pz_prefix}Class", 7162 "Number": ".", 7163 "Type": "String", 7164 "Description": "Variant class based on annotation criteria", 7165 }, 7166 } 7167 7168 # Create INFO fields if not exist 7169 for field in PZfields_INFOS: 7170 field_ID = PZfields_INFOS[field]["ID"] 7171 field_description = PZfields_INFOS[field]["Description"] 7172 if field_ID not in self.get_header().infos and field_ID in pzfields: 7173 field_description = ( 7174 PZfields_INFOS[field]["Description"] 7175 + f", profile {default_profile}" 7176 ) 7177 self.get_header().infos[field_ID] = vcf.parser._Info( 7178 field_ID, 7179 PZfields_INFOS[field]["Number"], 7180 PZfields_INFOS[field]["Type"], 7181 field_description, 7182 "unknown", 7183 "unknown", 7184 code_type_map[PZfields_INFOS[field]["Type"]], 7185 ) 7186 7187 # Create INFO fields if not exist for each profile 7188 for profile in prioritizations_config: 7189 if profile in profiles or profiles == []: 7190 for field in PZfields_INFOS: 7191 field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile 7192 field_description = ( 7193 PZfields_INFOS[field]["Description"] 7194 + f", profile {profile}" 7195 ) 7196 if ( 7197 field_ID not in self.get_header().infos 7198 and field in pzfields 7199 ): 7200 self.get_header().infos[field_ID] = vcf.parser._Info( 7201 field_ID, 7202 PZfields_INFOS[field]["Number"], 7203 PZfields_INFOS[field]["Type"], 7204 field_description, 7205 "unknown", 7206 "unknown", 7207 code_type_map[PZfields_INFOS[field]["Type"]], 7208 ) 7209 7210 # Header 7211 for pzfield in 
list_of_pzfields: 7212 if re.match(f"{pz_prefix}Score.*", pzfield): 7213 added_column = self.add_column( 7214 table_name=table_variants, 7215 column_name=pzfield, 7216 column_type="INTEGER", 7217 default_value="0", 7218 ) 7219 elif re.match(f"{pz_prefix}Flag.*", pzfield): 7220 added_column = self.add_column( 7221 table_name=table_variants, 7222 column_name=pzfield, 7223 column_type="BOOLEAN", 7224 default_value="1", 7225 ) 7226 elif re.match(f"{pz_prefix}Class.*", pzfield): 7227 added_column = self.add_column( 7228 table_name=table_variants, 7229 column_name=pzfield, 7230 column_type="VARCHAR[]", 7231 default_value="null", 7232 ) 7233 else: 7234 added_column = self.add_column( 7235 table_name=table_variants, 7236 column_name=pzfield, 7237 column_type="STRING", 7238 default_value="''", 7239 ) 7240 added_columns.append(added_column) 7241 7242 # Profiles 7243 if profiles: 7244 7245 # foreach profile in configuration file 7246 for profile in prioritizations_config: 7247 7248 # If profile is asked in param, or ALL are asked (empty profile []) 7249 if profile in profiles or profiles == []: 7250 log.info(f"Profile '{profile}'") 7251 7252 sql_set_info_option = "" 7253 7254 sql_set_info = [] 7255 7256 # PZ fields set 7257 7258 # PZScore 7259 if ( 7260 f"{pz_prefix}Score{pzfields_sep}{profile}" 7261 in list_of_pzfields 7262 ): 7263 sql_set_info.append( 7264 f""" 7265 concat( 7266 '{pz_prefix}Score{pzfields_sep}{profile}=', 7267 {pz_prefix}Score{pzfields_sep}{profile} 7268 ) 7269 """ 7270 ) 7271 if ( 7272 profile == default_profile 7273 and f"{pz_prefix}Score" in list_of_pzfields 7274 ): 7275 sql_set_info.append( 7276 f""" 7277 concat( 7278 '{pz_prefix}Score=', 7279 {pz_prefix}Score{pzfields_sep}{profile} 7280 ) 7281 """ 7282 ) 7283 7284 # PZFlag 7285 if ( 7286 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7287 in list_of_pzfields 7288 ): 7289 sql_set_info.append( 7290 f""" 7291 concat( 7292 '{pz_prefix}Flag{pzfields_sep}{profile}=', 7293 CASE 7294 WHEN 
{pz_prefix}Flag{pzfields_sep}{profile}==1 7295 THEN 'PASS' 7296 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7297 THEN 'FILTERED' 7298 END 7299 ) 7300 """ 7301 ) 7302 if ( 7303 profile == default_profile 7304 and f"{pz_prefix}Flag" in list_of_pzfields 7305 ): 7306 sql_set_info.append( 7307 f""" 7308 concat( 7309 '{pz_prefix}Flag=', 7310 CASE 7311 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1 7312 THEN 'PASS' 7313 WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0 7314 THEN 'FILTERED' 7315 END 7316 ) 7317 """ 7318 ) 7319 7320 # PZClass 7321 if ( 7322 f"{pz_prefix}Class{pzfields_sep}{profile}" 7323 in list_of_pzfields 7324 ): 7325 sql_set_info.append( 7326 f""" 7327 concat( 7328 '{pz_prefix}Class{pzfields_sep}{profile}=', 7329 CASE 7330 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7331 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7332 ELSE '.' 7333 END 7334 ) 7335 7336 """ 7337 ) 7338 if ( 7339 profile == default_profile 7340 and f"{pz_prefix}Class" in list_of_pzfields 7341 ): 7342 sql_set_info.append( 7343 f""" 7344 concat( 7345 '{pz_prefix}Class=', 7346 CASE 7347 WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7348 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7349 ELSE '.' 
7350 END 7351 ) 7352 """ 7353 ) 7354 7355 # PZComment 7356 if ( 7357 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7358 in list_of_pzfields 7359 ): 7360 sql_set_info.append( 7361 f""" 7362 CASE 7363 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7364 THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile}) 7365 ELSE '' 7366 END 7367 """ 7368 ) 7369 if ( 7370 profile == default_profile 7371 and f"{pz_prefix}Comment" in list_of_pzfields 7372 ): 7373 sql_set_info.append( 7374 f""" 7375 CASE 7376 WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('') 7377 THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile}) 7378 ELSE '' 7379 END 7380 """ 7381 ) 7382 7383 # PZInfos 7384 if ( 7385 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7386 in list_of_pzfields 7387 ): 7388 sql_set_info.append( 7389 f""" 7390 CASE 7391 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7392 THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile}) 7393 ELSE '' 7394 END 7395 """ 7396 ) 7397 if ( 7398 profile == default_profile 7399 and f"{pz_prefix}Infos" in list_of_pzfields 7400 ): 7401 sql_set_info.append( 7402 f""" 7403 CASE 7404 WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('') 7405 THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile}) 7406 ELSE '' 7407 END 7408 """ 7409 ) 7410 7411 # Merge PZfields 7412 sql_set_info_option = "" 7413 sql_set_sep = "" 7414 for sql_set in sql_set_info: 7415 if sql_set_sep: 7416 sql_set_info_option += f""" 7417 , concat('{sql_set_sep}', {sql_set}) 7418 """ 7419 else: 7420 sql_set_info_option += f""" 7421 , {sql_set} 7422 """ 7423 sql_set_sep = ";" 7424 7425 sql_queries = [] 7426 for annotation in prioritizations_config[profile]: 7427 7428 # skip special sections 7429 if annotation.startswith("_"): 7430 continue 7431 7432 # For each criterions 7433 for criterion in prioritizations_config[profile][ 7434 annotation 
7435 ]: 7436 7437 # Criterion mode 7438 criterion_mode = None 7439 if np.any( 7440 np.isin(list(criterion.keys()), ["type", "value"]) 7441 ): 7442 criterion_mode = "operation" 7443 elif np.any( 7444 np.isin(list(criterion.keys()), ["sql", "fields"]) 7445 ): 7446 criterion_mode = "sql" 7447 log.debug(f"Criterion Mode: {criterion_mode}") 7448 7449 # Criterion parameters 7450 criterion_type = criterion.get("type", None) 7451 criterion_value = criterion.get("value", None) 7452 criterion_sql = criterion.get("sql", None) 7453 criterion_fields = criterion.get("fields", None) 7454 criterion_score = criterion.get("score", 0) 7455 criterion_flag = criterion.get("flag", "PASS") 7456 criterion_class = criterion.get("class", None) 7457 criterion_flag_bool = criterion_flag == "PASS" 7458 criterion_comment = ( 7459 ", ".join(criterion.get("comment", [])) 7460 .replace("'", "''") 7461 .replace(";", ",") 7462 .replace("\t", " ") 7463 ) 7464 criterion_infos = ( 7465 str(criterion) 7466 .replace("'", "''") 7467 .replace(";", ",") 7468 .replace("\t", " ") 7469 ) 7470 7471 # SQL 7472 if criterion_sql is not None and isinstance( 7473 criterion_sql, list 7474 ): 7475 criterion_sql = " ".join(criterion_sql) 7476 7477 # Fields and explode 7478 if criterion_fields is None: 7479 criterion_fields = [annotation] 7480 if not isinstance(criterion_fields, list): 7481 criterion_fields = str(criterion_fields).split(",") 7482 7483 # Class 7484 if criterion_class is not None and not isinstance( 7485 criterion_class, list 7486 ): 7487 criterion_class = str(criterion_class).split(",") 7488 7489 for annotation_field in criterion_fields: 7490 7491 # Explode specific annotation 7492 log.debug( 7493 f"Explode annotation '{annotation_field}'" 7494 ) 7495 added_columns += self.explode_infos( 7496 prefix=explode_infos_prefix, 7497 fields=[annotation_field], 7498 table=table_variants, 7499 ) 7500 extra_infos = self.get_extra_infos( 7501 table=table_variants 7502 ) 7503 7504 # Check if annotation field is 
present 7505 if ( 7506 f"{explode_infos_prefix}{annotation_field}" 7507 not in extra_infos 7508 ): 7509 msq_err = f"Annotation '{annotation_field}' not in data" 7510 log.error(msq_err) 7511 raise ValueError(msq_err) 7512 else: 7513 log.debug( 7514 f"Annotation '{annotation_field}' in data" 7515 ) 7516 7517 sql_set = [] 7518 sql_set_info = [] 7519 7520 # PZ fields set 7521 7522 # PZScore 7523 if ( 7524 f"{pz_prefix}Score{pzfields_sep}{profile}" 7525 in list_of_pzfields 7526 ): 7527 # if prioritization_score_mode == "HOWARD": 7528 # sql_set.append( 7529 # f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7530 # ) 7531 # VaRank prioritization score mode 7532 if prioritization_score_mode == "VaRank": 7533 sql_set.append( 7534 f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END" 7535 ) 7536 # default HOWARD prioritization score mode 7537 else: 7538 sql_set.append( 7539 f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}" 7540 ) 7541 7542 # PZFlag 7543 if ( 7544 f"{pz_prefix}Flag{pzfields_sep}{profile}" 7545 in list_of_pzfields 7546 ): 7547 sql_set.append( 7548 f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}" 7549 ) 7550 7551 # PZClass 7552 if ( 7553 f"{pz_prefix}Class{pzfields_sep}{profile}" 7554 in list_of_pzfields 7555 and criterion_class is not None 7556 ): 7557 sql_set.append( 7558 f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) " 7559 ) 7560 7561 # PZComment 7562 if ( 7563 f"{pz_prefix}Comment{pzfields_sep}{profile}" 7564 in list_of_pzfields 7565 ): 7566 sql_set.append( 7567 f""" 7568 {pz_prefix}Comment{pzfields_sep}{profile} = 7569 concat( 7570 {pz_prefix}Comment{pzfields_sep}{profile}, 7571 CASE 7572 WHEN 
{pz_prefix}Comment{pzfields_sep}{profile}!='' 7573 THEN ', ' 7574 ELSE '' 7575 END, 7576 '{criterion_comment}' 7577 ) 7578 """ 7579 ) 7580 7581 # PZInfos 7582 if ( 7583 f"{pz_prefix}Infos{pzfields_sep}{profile}" 7584 in list_of_pzfields 7585 ): 7586 sql_set.append( 7587 f""" 7588 {pz_prefix}Infos{pzfields_sep}{profile} = 7589 concat( 7590 {pz_prefix}Infos{pzfields_sep}{profile}, 7591 '{criterion_infos}' 7592 ) 7593 """ 7594 ) 7595 sql_set_option = ",".join(sql_set) 7596 7597 # Criterion and comparison 7598 if sql_set_option: 7599 7600 if criterion_mode in ["operation"]: 7601 7602 try: 7603 float(criterion_value) 7604 sql_update = f""" 7605 UPDATE {table_variants} 7606 SET {sql_set_option} 7607 WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.') 7608 AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value} 7609 """ 7610 except: 7611 contains_option = "" 7612 if criterion_type == "contains": 7613 contains_option = ".*" 7614 sql_update = f""" 7615 UPDATE {table_variants} 7616 SET {sql_set_option} 7617 WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}' 7618 """ 7619 sql_queries.append(sql_update) 7620 7621 elif criterion_mode in ["sql"]: 7622 7623 sql_update = f""" 7624 UPDATE {table_variants} 7625 SET {sql_set_option} 7626 WHERE {criterion_sql} 7627 """ 7628 sql_queries.append(sql_update) 7629 7630 else: 7631 msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')" 7632 log.error(msg_err) 7633 raise ValueError(msg_err) 7634 7635 else: 7636 log.warning( 7637 f"NO SQL SET option for '{annotation}' - '{criterion}'" 7638 ) 7639 7640 # PZTags 7641 if ( 7642 f"{pz_prefix}Tags{pzfields_sep}{profile}" 7643 in list_of_pzfields 7644 ): 7645 7646 # Create PZFalgs value 7647 pztags_value = "" 7648 pztags_sep_default = "," 7649 pztags_sep = "" 7650 for pzfield in pzfields: 7651 if pzfield not in [f"{pz_prefix}Tags"]: 7652 if ( 7653 
f"{pzfield}{pzfields_sep}{profile}" 7654 in list_of_pzfields 7655 ): 7656 if pzfield in [f"{pz_prefix}Flag"]: 7657 pztags_value += f"""{pztags_sep}{pzfield}#', 7658 CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile} 7659 THEN 'PASS' 7660 ELSE 'FILTERED' 7661 END, '""" 7662 elif pzfield in [f"{pz_prefix}Class"]: 7663 pztags_value += f"""{pztags_sep}{pzfield}#', 7664 CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0 7665 THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',') 7666 ELSE '.' 7667 END, '""" 7668 else: 7669 pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '" 7670 pztags_sep = pztags_sep_default 7671 7672 # Add Query update for PZFlags 7673 sql_update_pztags = f""" 7674 UPDATE {table_variants} 7675 SET INFO = concat( 7676 INFO, 7677 CASE WHEN INFO NOT in ('','.') 7678 THEN ';' 7679 ELSE '' 7680 END, 7681 '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}' 7682 ) 7683 """ 7684 sql_queries.append(sql_update_pztags) 7685 7686 # Add Query update for PZFlags for default 7687 if profile == default_profile: 7688 sql_update_pztags_default = f""" 7689 UPDATE {table_variants} 7690 SET INFO = concat( 7691 INFO, 7692 ';', 7693 '{pz_prefix}Tags={pztags_value}' 7694 ) 7695 """ 7696 sql_queries.append(sql_update_pztags_default) 7697 7698 log.info(f"""Profile '{profile}' - Prioritization... """) 7699 7700 if sql_queries: 7701 7702 for sql_query in sql_queries: 7703 log.debug( 7704 f"""Profile '{profile}' - Prioritization query: {sql_query}... """ 7705 ) 7706 self.conn.execute(sql_query) 7707 7708 log.info(f"""Profile '{profile}' - Update... 
""") 7709 sql_query_update = f""" 7710 UPDATE {table_variants} 7711 SET INFO = 7712 concat( 7713 CASE 7714 WHEN INFO NOT IN ('','.') 7715 THEN concat(INFO, ';') 7716 ELSE '' 7717 END 7718 {sql_set_info_option} 7719 ) 7720 """ 7721 self.conn.execute(sql_query_update) 7722 7723 else: 7724 7725 log.warning(f"No profiles in parameters") 7726 7727 # Remove added columns 7728 for added_column in added_columns: 7729 self.drop_column(column=added_column) 7730 7731 # Explode INFOS fields into table fields 7732 if self.get_explode_infos(): 7733 self.explode_infos( 7734 prefix=self.get_explode_infos_prefix(), 7735 fields=self.get_explode_infos_fields(), 7736 force=True, 7737 ) 7738 7739 return True
The prioritization function in Python processes VCF files, adds new INFO fields, and
prioritizes variants based on configured profiles and criteria.
Parameters
- table: The
tableparameter in theprioritizationfunction is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table - pz_prefix: The
pz_prefixparameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ" - pz_param: The
pz_paramparameter in theprioritizationmethod is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a V
Returns
A boolean value (True) is being returned from the
prioritizationfunction.
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Workflow: resolve genome/refSeq/refSeqLink database files, select SNV/InDel
        variants, compute HGVS names per variant in parallel with Dask, write the
        result back into a temporary column, then append it to the INFO field as
        'hgvs=...' and declare the field in the VCF header.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains the values for the keys
                "CHROM", "POS", "REF" and "ALT"
            :return: a string that contains the HGVS names associated with the given row of data.
            """

            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # NOTE(review): CHROM/POS are interpolated directly into the SQL text;
            # assumes they come from the trusted variants table
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                    SELECT transcript
                    FROM refseq_df
                    WHERE CHROM='{chr}'
                    AND POS={pos}
                    """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    # NOTE(review): refseqlink_df is only defined when a
                    # refSeqLink file was found — confirm these options imply it
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                            SELECT protein
                            FROM refseqlink_df
                            WHERE transcript='{transcript_name}'
                            LIMIT 1
                            """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally add a second, protein-level HGVS name next to the
                # nucleotide-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations (comma-separated)
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): re-created again below once refseq_df/refseqlink_df exist;
        # this first instance looks redundant — confirm before removing
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "opt1=val1,opt2,..." shortcut into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare option name means "enable"
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f" {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; return early otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSseq refSeqLink (param overrides config)
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit file first, then search by folder/assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT purely alphabetic)
        query_variants = f"""
        SELECT "#CHROM" AS CHROM, POS, REF, ALT
        FROM {table_variants}
        WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
        """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns (temporary; dropped at the end)
        added_columns = []

        # Add hgvs column in variants table (randomized name to avoid collisions)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe (transcripts overlapping each variant)
        refseq_query = f"""
        SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
        FROM {refseq_table}
        JOIN df_variants ON (
            {refseq_table}.chrom = df_variants.CHROM
            AND {refseq_table}.txStart<=df_variants.POS
            AND {refseq_table}.txEnd>=df_variants.POS
        )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe (transcript -> protein accession)
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
            SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
            FROM {refseqlink_table}
            JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
            WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe (queried by name inside the partition function)
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
            COPY (
                SELECT {refseq_table}.*
                FROM {refseq_table}
                JOIN df_variants ON (
                    {refseq_table}.chrom=df_variants.CHROM
                    AND {refseq_table}.txStart<=df_variants.POS
                    AND {refseq_table}.txEnd>=df_variants.POS
                )
            )
            TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion (registers refseq_df / refseqlink_df globals for SQL)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column from the parquet file, matched on the variant key
            update_variant_query = f"""
            UPDATE {table_variants}
            SET "{hgvs_column_name}"=df."{hgvs_column_name}"
            FROM read_parquet('{df_parquet}') as df
            WHERE variants."#CHROM" = df.CHROM
            AND variants.POS = df.POS
            AND variants.REF = df.REF
            AND variants.ALT = df.ALT
            AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
            """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with ';' separator when needed
        sql_query_update = f"""
        UPDATE {table_variants}
        SET INFO =
            concat(
                CASE
                    WHEN INFO NOT IN ('','.')
                    THEN concat(INFO, ';')
                    ELSE ''
                END,
                'hgvs=',
                {hgvs_column_name}
            )
        WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
        """
        self.execute_query(sql_query_update)

        # Add header declaration for the new 'hgvs' INFO field
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The annotation_hgvs function performs HGVS annotation on a set of variants using genomic
coordinates and alleles.
Parameters
- threads: The `threads` parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the `get_threads()` method.
8132 def get_operations_help( 8133 self, operations_config_dict: dict = {}, operations_config_file: str = None 8134 ) -> list: 8135 8136 # Init 8137 operations_help = [] 8138 8139 # operations 8140 operations = self.get_config_json( 8141 name="calculations", 8142 config_dict=operations_config_dict, 8143 config_file=operations_config_file, 8144 ) 8145 for op in operations: 8146 op_name = operations[op].get("name", op).upper() 8147 op_description = operations[op].get("description", op_name) 8148 op_available = operations[op].get("available", False) 8149 if op_available: 8150 operations_help.append(f" {op_name}: {op_description}") 8151 8152 # Sort operations 8153 operations_help.sort() 8154 8155 # insert header 8156 operations_help.insert(0, "Available calculation operations:") 8157 8158 # Return 8159 return operations_help
    def calculation(
        self,
        operations: dict = {},
        operations_config_dict: dict = {},
        operations_config_file: str = None,
    ) -> None:
        """
        It takes a list of operations, and for each operation, it checks if it's a python or sql
        operation, and then calls the appropriate function

        :param operations: dict of operations to run, keyed by operation name;
            overridden by param["calculation"]["calculations"] when present
        :param operations_config_dict: optional calculations configuration as a dict
        :param operations_config_file: optional path to a calculations configuration file
        :raises ValueError: if an operation name or its type is not available in the
            operations configuration

        param json example:
            "calculation": {
                "NOMEN": {
                    "options": {
                        "hgvs_field": "hgvs"
                    },
                    "middle" : null
                }
            }
        """

        # Param
        param = self.get_param()

        # operations config
        operations_config = self.get_config_json(
            name="calculations",
            config_dict=operations_config_dict,
            config_file=operations_config_file,
        )

        # Upper keys (operation lookup is case-insensitive)
        operations_config = {k.upper(): v for k, v in operations_config.items()}

        # Calculations

        # Operations from param (takes precedence over the argument)
        operations = param.get("calculation", {}).get("calculations", operations)

        # Quick calculation - add (comma-separated "calculations" shortcut)
        if param.get("calculations", None):

            # List of operations
            calculations_list = [
                value.strip() for value in param.get("calculations", "").split(",")
            ]

            # Log
            log.info(f"Quick Calculations:")
            for calculation_key in calculations_list:
                log.info(f" {calculation_key}")

            # Create tmp operations (to keep operation order)
            operations_tmp = {}
            for calculation_operation in calculations_list:
                if calculation_operation.upper() not in operations_tmp:
                    log.debug(
                        f"{calculation_operation}.upper() not in {operations_tmp}"
                    )
                    operations_tmp[calculation_operation.upper()] = {}
                # Carry over any options already configured for this operation
                add_value_into_dict(
                    dict_tree=operations_tmp,
                    sections=[
                        calculation_operation.upper(),
                    ],
                    value=operations.get(calculation_operation.upper(), {}),
                )
            # Add operations already in param (after the quick ones, keeping order)
            for calculation_operation in operations:
                if calculation_operation not in operations_tmp:
                    operations_tmp[calculation_operation] = operations.get(
                        calculation_operation, {}
                    )

            # Update operations in param
            operations = operations_tmp

        # Operations for calculation
        if not operations:
            operations = param.get("calculation", {}).get("calculations", {})

        if operations:
            log.info(f"Calculations...")

            # For each operations
            for operation_name in operations:
                operation_name = operation_name.upper()
                if operation_name not in [""]:
                    if operation_name in operations_config:
                        log.info(f"Calculation '{operation_name}'")
                        operation = operations_config[operation_name]
                        # Dispatch on the configured operation type (sql default)
                        operation_type = operation.get("type", "sql")
                        if operation_type == "python":
                            self.calculation_process_function(
                                operation=operation, operation_name=operation_name
                            )
                        elif operation_type == "sql":
                            self.calculation_process_sql(
                                operation=operation, operation_name=operation_name
                            )
                        else:
                            log.error(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                            raise ValueError(
                                f"Operations config: Type '{operation_type}' NOT available"
                            )
                    else:
                        log.error(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )
                        raise ValueError(
                            f"Operations config: Calculation '{operation_name}' NOT available"
                        )

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function
param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle" : null }
8283 def calculation_process_sql( 8284 self, operation: dict, operation_name: str = "unknown" 8285 ) -> None: 8286 """ 8287 The `calculation_process_sql` function takes in a mathematical operation as a string and 8288 performs the operation, updating the specified table with the result. 8289 8290 :param operation: The `operation` parameter is a dictionary that contains information about the 8291 mathematical operation to be performed. It includes the following keys: 8292 :type operation: dict 8293 :param operation_name: The `operation_name` parameter is a string that represents the name of 8294 the mathematical operation being performed. It is used for logging and error handling purposes, 8295 defaults to unknown 8296 :type operation_name: str (optional) 8297 """ 8298 8299 # table variants 8300 table_variants = self.get_table_variants(clause="alter") 8301 8302 # Operation infos 8303 operation_name = operation.get("name", "unknown") 8304 log.debug(f"process sql {operation_name}") 8305 output_column_name = operation.get("output_column_name", operation_name) 8306 output_column_type = operation.get("output_column_type", "String") 8307 prefix = operation.get("explode_infos_prefix", "") 8308 output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR") 8309 output_column_description = operation.get( 8310 "output_column_description", f"{operation_name} operation" 8311 ) 8312 operation_query = operation.get("operation_query", None) 8313 if isinstance(operation_query, list): 8314 operation_query = " ".join(operation_query) 8315 operation_info_fields = operation.get("info_fields", []) 8316 operation_info_fields_check = operation.get("info_fields_check", False) 8317 operation_info = operation.get("operation_info", True) 8318 8319 if operation_query: 8320 8321 # Info fields check 8322 operation_info_fields_check_result = True 8323 if operation_info_fields_check: 8324 header_infos = self.get_header().infos 8325 for info_field in operation_info_fields: 8326 
operation_info_fields_check_result = ( 8327 operation_info_fields_check_result 8328 and info_field in header_infos 8329 ) 8330 8331 # If info fields available 8332 if operation_info_fields_check_result: 8333 8334 # Added_columns 8335 added_columns = [] 8336 8337 # Create VCF header field 8338 vcf_reader = self.get_header() 8339 vcf_reader.infos[output_column_name] = vcf.parser._Info( 8340 output_column_name, 8341 ".", 8342 output_column_type, 8343 output_column_description, 8344 "howard calculation", 8345 "0", 8346 self.code_type_map.get(output_column_type), 8347 ) 8348 8349 # Explode infos if needed 8350 log.debug(f"calculation_process_sql prefix {prefix}") 8351 added_columns += self.explode_infos( 8352 prefix=prefix, 8353 fields=[output_column_name] + operation_info_fields, 8354 force=True, 8355 ) 8356 8357 # Create column 8358 added_column = self.add_column( 8359 table_name=table_variants, 8360 column_name=prefix + output_column_name, 8361 column_type=output_column_type_sql, 8362 default_value="null", 8363 ) 8364 added_columns.append(added_column) 8365 8366 # Operation calculation 8367 try: 8368 8369 # Query to update calculation column 8370 sql_update = f""" 8371 UPDATE {table_variants} 8372 SET "{prefix}{output_column_name}" = ({operation_query}) 8373 """ 8374 self.conn.execute(sql_update) 8375 8376 # Add to INFO 8377 if operation_info: 8378 sql_update_info = f""" 8379 UPDATE {table_variants} 8380 SET "INFO" = 8381 concat( 8382 CASE 8383 WHEN "INFO" IS NOT NULL 8384 THEN concat("INFO", ';') 8385 ELSE '' 8386 END, 8387 '{output_column_name}=', 8388 "{prefix}{output_column_name}" 8389 ) 8390 WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('') 8391 """ 8392 self.conn.execute(sql_update_info) 8393 8394 except: 8395 log.error( 8396 f"Operations config: Calculation '{operation_name}' query failed" 8397 ) 8398 raise ValueError( 8399 f"Operations config: Calculation '{operation_name}' query failed" 8400 ) 8401 8402 # Remove 
added columns 8403 for added_column in added_columns: 8404 log.debug(f"added_column: {added_column}") 8405 self.drop_column(column=added_column) 8406 8407 else: 8408 log.error( 8409 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8410 ) 8411 raise ValueError( 8412 f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}" 8413 ) 8414 8415 else: 8416 log.error( 8417 f"Operations config: Calculation '{operation_name}' query NOT defined" 8418 ) 8419 raise ValueError( 8420 f"Operations config: Calculation '{operation_name}' query NOT defined" 8421 )
The calculation_process_sql function takes in a mathematical operation as a string and
performs the operation, updating the specified table with the result.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes; defaults to "unknown".
8423 def calculation_process_function( 8424 self, operation: dict, operation_name: str = "unknown" 8425 ) -> None: 8426 """ 8427 The `calculation_process_function` takes in an operation dictionary and performs the specified 8428 function with the given parameters. 8429 8430 :param operation: The `operation` parameter is a dictionary that contains information about the 8431 operation to be performed. It has the following keys: 8432 :type operation: dict 8433 :param operation_name: The `operation_name` parameter is a string that represents the name of 8434 the operation being performed. It is used for logging purposes, defaults to unknown 8435 :type operation_name: str (optional) 8436 """ 8437 8438 operation_name = operation["name"] 8439 log.debug(f"process sql {operation_name}") 8440 function_name = operation["function_name"] 8441 function_params = operation["function_params"] 8442 getattr(self, function_name)(*function_params)
The calculation_process_function takes in an operation dictionary and performs the specified
function with the given parameters.
Parameters
- operation: The `operation` parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
- operation_name: The `operation_name` parameter is a string that represents the name of the operation being performed. It is used for logging purposes; defaults to "unknown".
8444 def calculation_variant_id(self) -> None: 8445 """ 8446 The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and 8447 updates the INFO field of a variants table with the variant ID. 8448 """ 8449 8450 # variant_id annotation field 8451 variant_id_tag = self.get_variant_id_column() 8452 added_columns = [variant_id_tag] 8453 8454 # variant_id hgvs tags" 8455 vcf_infos_tags = { 8456 variant_id_tag: "howard variant ID annotation", 8457 } 8458 8459 # Variants table 8460 table_variants = self.get_table_variants() 8461 8462 # Header 8463 vcf_reader = self.get_header() 8464 8465 # Add variant_id to header 8466 vcf_reader.infos[variant_id_tag] = vcf.parser._Info( 8467 variant_id_tag, 8468 ".", 8469 "String", 8470 vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"), 8471 "howard calculation", 8472 "0", 8473 self.code_type_map.get("String"), 8474 ) 8475 8476 # Update 8477 sql_update = f""" 8478 UPDATE {table_variants} 8479 SET "INFO" = 8480 concat( 8481 CASE 8482 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 8483 THEN '' 8484 ELSE concat("INFO", ';') 8485 END, 8486 '{variant_id_tag}=', 8487 "{variant_id_tag}" 8488 ) 8489 """ 8490 self.conn.execute(sql_update) 8491 8492 # Remove added columns 8493 for added_column in added_columns: 8494 self.drop_column(column=added_column)
The function calculation_variant_id adds a variant ID annotation to a VCF file header and
updates the INFO field of a variants table with the variant ID.
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
        annotation field in a VCF file and adds them as a new column in the variants table.

        :param snpeff_hgvs: name of the INFO field that will store the HGVS
            nomenclatures extracted from the SnpEff annotation field, defaults to
            snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: name of the INFO field that contains the SnpEff
            annotations to parse, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" —
        # confirm this is intended rather than keeping the configured prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary; dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract ANN header: snpEff lists its sub-fields in a quoted,
            # " | "-separated string inside the INFO description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the lookup key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create main NOMEN column
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Add snpeff_hgvs to header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Update: append '<tag>=<value>' to INFO, joined on the variant id
            sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{snpeff_hgvs}=',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff
annotation field in a VCF file and adds them as a new column in the variants table.
Parameters
- snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs` function is used to specify the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to "snpeff_hgvs".
- snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs` function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field; defaults to "ANN".
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
        exploding the HGVS field and updating variant information accordingly.

        :param uniquify: whether exploded annotation values should be de-duplicated,
            defaults to True
        :type uniquify: bool (optional)
        :param output_format: output shape of the annotations: "fields" (one INFO
            field per snpEff annotation) or "JSON" (a single JSON-encoded INFO
            field), defaults to fields
        :type output_format: str (optional)
        :param output_prefix: prefix added to the generated INFO field names,
            defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: name of the INFO field that contains the SnpEff
            annotations to explode, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff header description cannot be parsed
        """

        # SnpEff annotation field (internal working column name)
        snpeff_hgvs = "snpeff_ann_explode"

        # Snpeff hgvs tags
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix
        # NOTE(review): any non-empty explode prefix is replaced by "INFO/" —
        # confirm this is intended rather than keeping the configured prefix
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # snpEff fields (exploded column names)
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Add columns (temporary; dropped at the end)
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract ANN header: snpEff lists its sub-fields in a quoted,
            # " | "-separated string inside the INFO description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the annotation key
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Create dataframe
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Create snpEff columns
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Header
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                # Single JSON INFO field named after the prefix
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                # One INFO field per snpEff annotation column
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Update: append the exploded annotations to INFO, joined on the
            # variant id
            sql_update = f"""
            UPDATE variants
            SET "INFO" =
                concat(
                    CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    CASE
                        WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                        AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                        THEN concat(
                            '{ann_annotations_prefix}',
                            dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                        )
                        ELSE ''
                    END
                )
            FROM dataframe_snpeff_hgvs
            WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Delete dataframe (free memory)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by
exploding the HGVS field and updating variant information accordingly.
Parameters
- `uniquify`: boolean flag that determines whether duplicate annotation entries are removed from the output; defaults to True.
- `output_format`: format in which the output annotations are generated, either "fields" or "JSON"; defaults to "fields".
- `output_prefix`: prefix added to the newly generated annotations to differentiate them from existing ones; defaults to "snpeff_".
- `snpeff_field`: name of the VCF INFO field containing the SnpEff annotations to explode; defaults to "ANN".
    def calculation_extract_nomen(self) -> None:
        """
        Extract HGVS nomenclature (NOMEN and related sub-fields) for each
        variant and append them to the INFO column.

        The configured HGVS field (option ``hgvs_field``, default "hgvs") is
        exploded into a column, then `find_nomen` picks the reference
        nomenclature per variant using preferred transcripts gathered from a
        file and/or a table column, ordered by ``transcripts_order``. Each
        NOMEN sub-field (NOMEN, CNOMEN, RNOMEN, ...) is declared in the VCF
        header and appended to INFO as ``;FIELD=value``.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Column name holding the per-variant dict returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN sub-fields and their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Parameters (calculation.calculations.NOMEN.options.*)
        param = self.get_param()

        # Prefix of exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # VCF header object (mutated in place to declare new INFO tags)
        vcf_reader = self.get_header()

        # Columns added during this calculation; dropped in the final loop
        added_columns = []

        # INFO field holding the HGVS list to parse
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional pattern constraining which NOMEN is selected
        nomen_pattern = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("pattern", None)
        )

        # Preferred-transcript lists keyed by source ("file", ...)
        transcripts_sources = {}

        # Optional file listing preferred transcripts (first column)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts_from_file = transcripts_dataframe.iloc[:, 0].tolist()
                transcripts_sources["file"] = transcripts_from_file
            else:
                msg_err = f"Transcript file '{transcripts_file}' does NOT exist"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Optional table/column providing a per-variant preferred transcript
        transcripts_table = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_table", self.get_table_variants())
        )
        transcripts_column = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_column", None)
        )

        if transcripts_table and transcripts_column:
            # SQL expression selecting the per-variant transcript
            extra_field_transcript = f"{transcripts_table}.{transcripts_column}"
            # Explode if not exists
            self.explode_infos(fields=[transcripts_column], table=transcripts_table)
        else:
            extra_field_transcript = f"NULL"

        # Order in which transcript sources are consulted by find_nomen
        transcripts_order = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts_order", ["column", "file"])
        )

        # Flat preferred-transcript list from the file source (if any)
        transcripts = transcripts_sources.get("file", [])

        # Explode the HGVS field into a queryable column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed if the exploded HGVS column actually exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Pull coordinates, HGVS and preferred transcript per variant.
            # NOTE: the variable name 'dataframe_hgvs' is referenced by the
            # SQL below (duckdb resolves it from the local scope).
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" AS hgvs, {extra_field_transcript} AS transcript FROM variants """
            )

            # Compute the NOMEN dict for each variant row
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs.apply(
                lambda x: find_nomen(
                    hgvs=x.hgvs,
                    transcript=x.transcript,
                    transcripts=transcripts,
                    pattern=nomen_pattern,
                    transcripts_source_order=transcripts_order,
                ),
                axis=1,
            )

            # Explode each NOMEN sub-field into its own column and build one
            # SQL CASE fragment per field for the INFO update below.
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Lambda is applied immediately, so capturing the loop
                # variable here is safe (no late-binding issue).
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the sub-field in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                        THEN concat(
                                ';{nomen_field}=',
                                dataframe_hgvs."{nomen_field}"
                            )
                        ELSE ''
                    END
                    """
                )

            # Combine per-field fragments into one SET expression
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN fields to INFO, joining on genomic coordinates.
            # NOTE(review): unlike other calculations, an INFO equal to '.'
            # is kept as-is before appending — confirm whether '.' should be
            # cleared here too.
            sql_update = f"""
                UPDATE variants
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS"
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the dataframe eagerly (can be large)
            del dataframe_hgvs
            gc.collect()

        # Drop the helper columns created above
        for added_column in added_columns:
            self.drop_column(column=added_column)
This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
9007 def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None: 9008 """ 9009 The function `calculation_find_by_pipeline` performs a calculation to find the number of 9010 pipeline/sample for a variant and updates the variant information in a VCF file. 9011 9012 :param tag: The `tag` parameter is a string that represents the annotation field for the 9013 "findbypipeline" information in the VCF file. It is used to create the annotation field in the 9014 VCF header and to update the corresponding field in the variants table, defaults to 9015 findbypipeline 9016 :type tag: str (optional) 9017 """ 9018 9019 # if FORMAT and samples 9020 if ( 9021 "FORMAT" in self.get_header_columns_as_list() 9022 and self.get_header_sample_list() 9023 ): 9024 9025 # findbypipeline annotation field 9026 findbypipeline_tag = tag 9027 9028 # VCF infos tags 9029 vcf_infos_tags = { 9030 findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})", 9031 } 9032 9033 # Prefix 9034 prefix = self.get_explode_infos_prefix() 9035 9036 # Field 9037 findbypipeline_infos = prefix + findbypipeline_tag 9038 9039 # Variants table 9040 table_variants = self.get_table_variants() 9041 9042 # Header 9043 vcf_reader = self.get_header() 9044 9045 # Create variant id 9046 variant_id_column = self.get_variant_id_column() 9047 added_columns = [variant_id_column] 9048 9049 # variant_id, FORMAT and samples 9050 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9051 self.get_header_sample_list() 9052 ) 9053 9054 # Create dataframe 9055 dataframe_findbypipeline = self.get_query_to_df( 9056 f""" SELECT {samples_fields} FROM {table_variants} """ 9057 ) 9058 9059 # Create findbypipeline column 9060 dataframe_findbypipeline[findbypipeline_infos] = ( 9061 dataframe_findbypipeline.apply( 9062 lambda row: findbypipeline( 9063 row, samples=self.get_header_sample_list() 9064 ), 9065 axis=1, 9066 ) 9067 ) 9068 9069 # Add snpeff_hgvs to header 9070 
vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info( 9071 findbypipeline_tag, 9072 ".", 9073 "String", 9074 vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"), 9075 "howard calculation", 9076 "0", 9077 self.code_type_map.get("String"), 9078 ) 9079 9080 # Update 9081 sql_update = f""" 9082 UPDATE variants 9083 SET "INFO" = 9084 concat( 9085 CASE 9086 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9087 THEN '' 9088 ELSE concat("INFO", ';') 9089 END, 9090 CASE 9091 WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.') 9092 AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL 9093 THEN concat( 9094 '{findbypipeline_tag}=', 9095 dataframe_findbypipeline."{findbypipeline_infos}" 9096 ) 9097 ELSE '' 9098 END 9099 ) 9100 FROM dataframe_findbypipeline 9101 WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}" 9102 """ 9103 self.conn.execute(sql_update) 9104 9105 # Remove added columns 9106 for added_column in added_columns: 9107 self.drop_column(column=added_column) 9108 9109 # Delete dataframe 9110 del dataframe_findbypipeline 9111 gc.collect()
The function calculation_find_by_pipeline performs a calculation to find the number of
pipeline/sample for a variant and updates the variant information in a VCF file.
Parameters
- `tag`: string naming the annotation field for the "findbypipeline" information in the VCF file; it is used to create the annotation field in the VCF header and to update the corresponding field in the variants table; defaults to "findbypipeline".
9113 def calculation_genotype_concordance(self) -> None: 9114 """ 9115 The function `calculation_genotype_concordance` calculates the genotype concordance for 9116 multi-caller VCF files and updates the variant information in the database. 9117 """ 9118 9119 # if FORMAT and samples 9120 if ( 9121 "FORMAT" in self.get_header_columns_as_list() 9122 and self.get_header_sample_list() 9123 ): 9124 9125 # genotypeconcordance annotation field 9126 genotypeconcordance_tag = "genotypeconcordance" 9127 9128 # VCF infos tags 9129 vcf_infos_tags = { 9130 genotypeconcordance_tag: "Concordance of genotype for multi caller VCF", 9131 } 9132 9133 # Prefix 9134 prefix = self.get_explode_infos_prefix() 9135 9136 # Field 9137 genotypeconcordance_infos = prefix + genotypeconcordance_tag 9138 9139 # Variants table 9140 table_variants = self.get_table_variants() 9141 9142 # Header 9143 vcf_reader = self.get_header() 9144 9145 # Create variant id 9146 variant_id_column = self.get_variant_id_column() 9147 added_columns = [variant_id_column] 9148 9149 # variant_id, FORMAT and samples 9150 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9151 self.get_header_sample_list() 9152 ) 9153 9154 # Create dataframe 9155 dataframe_genotypeconcordance = self.get_query_to_df( 9156 f""" SELECT {samples_fields} FROM {table_variants} """ 9157 ) 9158 9159 # Create genotypeconcordance column 9160 dataframe_genotypeconcordance[genotypeconcordance_infos] = ( 9161 dataframe_genotypeconcordance.apply( 9162 lambda row: genotypeconcordance( 9163 row, samples=self.get_header_sample_list() 9164 ), 9165 axis=1, 9166 ) 9167 ) 9168 9169 # Add genotypeconcordance to header 9170 vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info( 9171 genotypeconcordance_tag, 9172 ".", 9173 "String", 9174 vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"), 9175 "howard calculation", 9176 "0", 9177 self.code_type_map.get("String"), 9178 ) 9179 9180 # Update 9181 sql_update = f""" 9182 
UPDATE variants 9183 SET "INFO" = 9184 concat( 9185 CASE 9186 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9187 THEN '' 9188 ELSE concat("INFO", ';') 9189 END, 9190 CASE 9191 WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.') 9192 AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL 9193 THEN concat( 9194 '{genotypeconcordance_tag}=', 9195 dataframe_genotypeconcordance."{genotypeconcordance_infos}" 9196 ) 9197 ELSE '' 9198 END 9199 ) 9200 FROM dataframe_genotypeconcordance 9201 WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}" 9202 """ 9203 self.conn.execute(sql_update) 9204 9205 # Remove added columns 9206 for added_column in added_columns: 9207 self.drop_column(column=added_column) 9208 9209 # Delete dataframe 9210 del dataframe_genotypeconcordance 9211 gc.collect()
The function calculation_genotype_concordance calculates the genotype concordance for
multi-caller VCF files and updates the variant information in the database.
9213 def calculation_barcode(self, tag: str = "barcode") -> None: 9214 """ 9215 The `calculation_barcode` function calculates barcode values for variants in a VCF file and 9216 updates the INFO field in the file with the calculated barcode values. 9217 9218 :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag 9219 name that will be used for the barcode calculation in the VCF file. If no tag name is provided, 9220 the default tag name is set to "barcode", defaults to barcode 9221 :type tag: str (optional) 9222 """ 9223 9224 # if FORMAT and samples 9225 if ( 9226 "FORMAT" in self.get_header_columns_as_list() 9227 and self.get_header_sample_list() 9228 ): 9229 9230 # barcode annotation field 9231 if not tag: 9232 tag = "barcode" 9233 9234 # VCF infos tags 9235 vcf_infos_tags = { 9236 tag: "barcode calculation (VaRank)", 9237 } 9238 9239 # Prefix 9240 prefix = self.get_explode_infos_prefix() 9241 9242 # Field 9243 barcode_infos = prefix + tag 9244 9245 # Variants table 9246 table_variants = self.get_table_variants() 9247 9248 # Header 9249 vcf_reader = self.get_header() 9250 9251 # Create variant id 9252 variant_id_column = self.get_variant_id_column() 9253 added_columns = [variant_id_column] 9254 9255 # variant_id, FORMAT and samples 9256 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9257 self.get_header_sample_list() 9258 ) 9259 9260 # Create dataframe 9261 dataframe_barcode = self.get_query_to_df( 9262 f""" SELECT {samples_fields} FROM {table_variants} """ 9263 ) 9264 9265 # Create barcode column 9266 dataframe_barcode[barcode_infos] = dataframe_barcode.apply( 9267 lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1 9268 ) 9269 9270 # Add barcode to header 9271 vcf_reader.infos[tag] = vcf.parser._Info( 9272 tag, 9273 ".", 9274 "String", 9275 vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)), 9276 "howard calculation", 9277 "0", 9278 self.code_type_map.get("String"), 9279 ) 9280 9281 # 
Update 9282 sql_update = f""" 9283 UPDATE {table_variants} 9284 SET "INFO" = 9285 concat( 9286 CASE 9287 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9288 THEN '' 9289 ELSE concat("INFO", ';') 9290 END, 9291 CASE 9292 WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.') 9293 AND dataframe_barcode."{barcode_infos}" NOT NULL 9294 THEN concat( 9295 '{tag}=', 9296 dataframe_barcode."{barcode_infos}" 9297 ) 9298 ELSE '' 9299 END 9300 ) 9301 FROM dataframe_barcode 9302 WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}" 9303 """ 9304 self.conn.execute(sql_update) 9305 9306 # Remove added columns 9307 for added_column in added_columns: 9308 self.drop_column(column=added_column) 9309 9310 # Delete dataframe 9311 del dataframe_barcode 9312 gc.collect()
The calculation_barcode function calculates barcode values for variants in a VCF file and
updates the INFO field in the file with the calculated barcode values.
Parameters
- `tag`: name of the tag that will be used for the barcode calculation in the VCF file; if no tag name is provided, the default tag name "barcode" is used; defaults to "barcode".
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for each variant and append it to every
        sample's genotype (FORMAT) columns.

        The family members are taken from the ``family_pedigree`` option
        (a JSON file path, a JSON string, a comma-separated sample list, or a
        dict); without a pedigree, all samples are used. The project helper
        `barcode` is applied to the pedigree samples, and two FORMAT tags are
        appended to each genotype: the barcode itself (``tag``) and the sample
        list it was computed from (``tagS``).

        :param tag: barcode FORMAT tag added to the VCF file; falsy values
            fall back to "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is malformed or resolves to no
            samples
        """

        # Genotype data is required: a FORMAT column plus at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Fall back to the default tag name if an empty one was passed
            if not tag:
                tag = "BCF"

            # FORMAT header descriptions for the two tags written below
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Parameters (calculation.calculations.BARCODEFAMILY.*)
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix of exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Pedigree option: file path, JSON string, sample list or dict
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a path to a JSON file
                # (note: 'ped' is deliberately rebound from path to dict here)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated list
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is already a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Anything else is rejected
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Sample names of the family members
                ped_samples = list(ped.values())

            else:
                # No pedigree configured: use every sample in the file
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # A non-empty pedigree is mandatory past this point
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log the resolved family members
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Exploded-column name holding the computed barcode
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object (mutated in place)
            vcf_reader = self.get_header()

            # Variant id column: created for the join, dropped at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Select variant id, FORMAT and the pedigree sample columns only
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # NOTE: the variable name 'dataframe_barcode' is referenced by the
            # SQL below (duckdb resolves it from the local scope).
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the family barcode for each variant row
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare both FORMAT tags (barcode and its sample list) in the
            # VCF header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Build one SET fragment per genotype column: family samples get
            # the barcode value, FORMAT gets the tag names, other samples get
            # '.'. Missing genotypes ('./.') are first padded with one '.' per
            # FORMAT key so the appended fields stay aligned.
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # Regex matching FORMAT key characters; stripping them leaves
                # only the ':' separators, which are then padded with '.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                    "{sample}" =
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all genotype-column updates in a single statement
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Drop the helper column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe eagerly (can be large)
            del dataframe_barcode
            gc.collect()
The calculation_barcode_family function calculates barcode values for variants in a VCF file
and updates the INFO field in the file with the calculated barcode values.
Parameters
- `tag`: barcode tag that will be added to the VCF file during the calculation process; if no value is provided, the default value "BCF" is used; defaults to "BCF".
9504 def calculation_trio(self) -> None: 9505 """ 9506 The `calculation_trio` function performs trio calculations on a VCF file by adding trio 9507 information to the INFO field of each variant. 9508 """ 9509 9510 # if FORMAT and samples 9511 if ( 9512 "FORMAT" in self.get_header_columns_as_list() 9513 and self.get_header_sample_list() 9514 ): 9515 9516 # trio annotation field 9517 trio_tag = "trio" 9518 9519 # VCF infos tags 9520 vcf_infos_tags = { 9521 "trio": "trio calculation", 9522 } 9523 9524 # Param 9525 param = self.get_param() 9526 9527 # Prefix 9528 prefix = self.get_explode_infos_prefix() 9529 9530 # Trio param 9531 trio_ped = ( 9532 param.get("calculation", {}) 9533 .get("calculations", {}) 9534 .get("TRIO", {}) 9535 .get("trio_pedigree", None) 9536 ) 9537 9538 # Load trio 9539 if trio_ped: 9540 9541 # Trio pedigree is a file 9542 if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)): 9543 log.debug("TRIO pedigree is file") 9544 with open(full_path(trio_ped)) as trio_ped: 9545 trio_ped = json.load(trio_ped) 9546 9547 # Trio pedigree is a string 9548 elif isinstance(trio_ped, str): 9549 log.debug("TRIO pedigree is str") 9550 try: 9551 trio_ped = json.loads(trio_ped) 9552 log.debug("TRIO pedigree is json str") 9553 except ValueError as e: 9554 trio_samples = trio_ped.split(",") 9555 if len(trio_samples) == 3: 9556 trio_ped = { 9557 "father": trio_samples[0], 9558 "mother": trio_samples[1], 9559 "child": trio_samples[2], 9560 } 9561 log.debug("TRIO pedigree is list str") 9562 else: 9563 msg_error = "TRIO pedigree not well formatted" 9564 log.error(msg_error) 9565 raise ValueError(msg_error) 9566 9567 # Trio pedigree is a dict 9568 elif isinstance(trio_ped, dict): 9569 log.debug("TRIO pedigree is dict") 9570 9571 # Trio pedigree is not well formatted 9572 else: 9573 msg_error = "TRIO pedigree not well formatted" 9574 log.error(msg_error) 9575 raise ValueError(msg_error) 9576 9577 # Construct trio list 9578 trio_samples = [ 9579 
trio_ped.get("father", ""), 9580 trio_ped.get("mother", ""), 9581 trio_ped.get("child", ""), 9582 ] 9583 9584 else: 9585 log.debug("TRIO pedigree not defined. Take the first 3 samples") 9586 samples_list = self.get_header_sample_list() 9587 if len(samples_list) >= 3: 9588 trio_samples = self.get_header_sample_list()[0:3] 9589 trio_ped = { 9590 "father": trio_samples[0], 9591 "mother": trio_samples[1], 9592 "child": trio_samples[2], 9593 } 9594 else: 9595 msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}" 9596 log.error(msg_error) 9597 raise ValueError(msg_error) 9598 9599 # Check trio pedigree 9600 if not trio_ped or len(trio_ped) != 3: 9601 msg_error = f"Error in TRIO pedigree: {trio_ped}" 9602 log.error(msg_error) 9603 raise ValueError(msg_error) 9604 9605 # Log 9606 log.info( 9607 f"Calculation 'TRIO' - Samples: " 9608 + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped]) 9609 ) 9610 9611 # Field 9612 trio_infos = prefix + trio_tag 9613 9614 # Variants table 9615 table_variants = self.get_table_variants() 9616 9617 # Header 9618 vcf_reader = self.get_header() 9619 9620 # Create variant id 9621 variant_id_column = self.get_variant_id_column() 9622 added_columns = [variant_id_column] 9623 9624 # variant_id, FORMAT and samples 9625 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9626 self.get_header_sample_list() 9627 ) 9628 9629 # Create dataframe 9630 dataframe_trio = self.get_query_to_df( 9631 f""" SELECT {samples_fields} FROM {table_variants} """ 9632 ) 9633 9634 # Create trio column 9635 dataframe_trio[trio_infos] = dataframe_trio.apply( 9636 lambda row: trio(row, samples=trio_samples), axis=1 9637 ) 9638 9639 # Add trio to header 9640 vcf_reader.infos[trio_tag] = vcf.parser._Info( 9641 trio_tag, 9642 ".", 9643 "String", 9644 vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"), 9645 "howard calculation", 9646 "0", 9647 self.code_type_map.get("String"), 9648 ) 9649 9650 # Update 9651 
sql_update = f""" 9652 UPDATE {table_variants} 9653 SET "INFO" = 9654 concat( 9655 CASE 9656 WHEN "INFO" IS NULL OR "INFO" IN ('','.') 9657 THEN '' 9658 ELSE concat("INFO", ';') 9659 END, 9660 CASE 9661 WHEN dataframe_trio."{trio_infos}" NOT IN ('','.') 9662 AND dataframe_trio."{trio_infos}" NOT NULL 9663 THEN concat( 9664 '{trio_tag}=', 9665 dataframe_trio."{trio_infos}" 9666 ) 9667 ELSE '' 9668 END 9669 ) 9670 FROM dataframe_trio 9671 WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}" 9672 """ 9673 self.conn.execute(sql_update) 9674 9675 # Remove added columns 9676 for added_column in added_columns: 9677 self.drop_column(column=added_column) 9678 9679 # Delete dataframe 9680 del dataframe_trio 9681 gc.collect()
The calculation_trio function performs trio calculations on a VCF file by adding trio
information to the INFO field of each variant.
9683 def calculation_vaf_normalization(self) -> None: 9684 """ 9685 The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency) 9686 normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly. 9687 :return: The function does not return anything. 9688 """ 9689 9690 # if FORMAT and samples 9691 if ( 9692 "FORMAT" in self.get_header_columns_as_list() 9693 and self.get_header_sample_list() 9694 ): 9695 9696 # vaf_normalization annotation field 9697 vaf_normalization_tag = "VAF" 9698 9699 # VCF infos tags 9700 vcf_infos_tags = { 9701 "VAF": "VAF Variant Frequency", 9702 } 9703 9704 # Prefix 9705 prefix = self.get_explode_infos_prefix() 9706 9707 # Variants table 9708 table_variants = self.get_table_variants() 9709 9710 # Header 9711 vcf_reader = self.get_header() 9712 9713 # Do not calculate if VAF already exists 9714 if "VAF" in vcf_reader.formats: 9715 log.debug("VAF already on genotypes") 9716 return 9717 9718 # Create variant id 9719 variant_id_column = self.get_variant_id_column() 9720 added_columns = [variant_id_column] 9721 9722 # variant_id, FORMAT and samples 9723 samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join( 9724 f""" "{sample}" """ for sample in self.get_header_sample_list() 9725 ) 9726 9727 # Create dataframe 9728 query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """ 9729 log.debug(f"query={query}") 9730 dataframe_vaf_normalization = self.get_query_to_df(query=query) 9731 9732 vaf_normalization_set = [] 9733 9734 # for each sample vaf_normalization 9735 for sample in self.get_header_sample_list(): 9736 dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply( 9737 lambda row: vaf_normalization(row, sample=sample), axis=1 9738 ) 9739 vaf_normalization_set.append( 9740 f""" "{sample}" = dataframe_vaf_normalization."{sample}" """ 9741 ) 9742 9743 # Add VAF to FORMAT 9744 dataframe_vaf_normalization["FORMAT"] = 
dataframe_vaf_normalization[ 9745 "FORMAT" 9746 ].apply(lambda x: str(x) + ":VAF") 9747 vaf_normalization_set.append( 9748 f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """ 9749 ) 9750 9751 # Add vaf_normalization to header 9752 vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format( 9753 id=vaf_normalization_tag, 9754 num="1", 9755 type="Float", 9756 desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"), 9757 type_code=self.code_type_map.get("Float"), 9758 ) 9759 9760 # Create fields to add in INFO 9761 sql_vaf_normalization_set = " , ".join(vaf_normalization_set) 9762 9763 # Update 9764 sql_update = f""" 9765 UPDATE {table_variants} 9766 SET {sql_vaf_normalization_set} 9767 FROM dataframe_vaf_normalization 9768 WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}" 9769 9770 """ 9771 self.conn.execute(sql_update) 9772 9773 # Remove added columns 9774 for added_column in added_columns: 9775 self.drop_column(column=added_column) 9776 9777 # Delete dataframe 9778 del dataframe_vaf_normalization 9779 gc.collect()
The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency)
normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
Returns
The function does not return anything.
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given
        information field in a VCF file and updates the INFO column of the variants table with the
        calculated statistics (number, list, min, max, mean, median and standard deviation of the
        values found on the samples' genotypes).

        :param info: The `info` parameter is a string that represents the type of information for
            which genotype statistics are calculated. It is used to generate the VCF info tags for
            the statistics (e.g. `VAF_stats_nb`, `VAF_stats_min`, ...), defaults to VAF
        :type info: str (optional)
        """

        # Only process genotypes when a FORMAT column and samples are present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags: INFO field name -> header description
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Dataframe column holding the per-variant stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (technical column used to join back the dataframe)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples (samples quoted for SQL safety)
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                f""" "{sample}" """ for sample in self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column: one stats dict per variant, computed
            # across all samples' genotypes for the requested info field
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of SQL CASE fragments, one per stats tag
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract this stat into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the stat tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no separator; subsequent ones are ';'-prefixed
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                    CASE
                        WHEN dataframe_vaf_stats."{stat}" NOT NULL
                        THEN concat(
                            '{sep}{stat}=',
                            dataframe_vaf_stats."{stat}"
                        )
                        ELSE ''
                    END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ", ".join(sql_vaf_stats_fields)

            # Update INFO from the in-memory dataframe
            # (duckdb resolves 'dataframe_vaf_stats' by local variable name)
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" =
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
The calculation_genotype_stats function calculates genotype statistics for a given information
field in a VCF file and updates the INFO column of the variants table with the calculated
statistics.
Parameters
- info: The `info` parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to "VAF".
9919 def calculation_transcripts_annotation( 9920 self, info_json: str = None, info_format: str = None 9921 ) -> None: 9922 """ 9923 The `calculation_transcripts_annotation` function creates a transcripts table and adds an info 9924 field to it if transcripts are available. 9925 9926 :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method 9927 is a string parameter that represents the information field to be used in the transcripts JSON. 9928 It is used to specify the JSON format for the transcripts information. If no value is provided 9929 when calling the method, it defaults to " 9930 :type info_json: str 9931 :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation` 9932 method is a string parameter that specifies the format of the information field to be used in 9933 the transcripts JSON. It is used to define the format of the information field 9934 :type info_format: str 9935 """ 9936 9937 # Create transcripts table 9938 transcripts_table = self.create_transcript_view() 9939 9940 # Add info field 9941 if transcripts_table: 9942 self.transcript_view_to_variants( 9943 transcripts_table=transcripts_table, 9944 transcripts_info_field_json=info_json, 9945 transcripts_info_field_format=info_format, 9946 ) 9947 else: 9948 log.info("No Transcripts to process. Check param.json file configuration")
The calculation_transcripts_annotation function creates a transcripts table and adds an info
field to it if transcripts are available.
Parameters
- info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method is a string that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. Defaults to None.
- info_format: The `info_format` parameter in the `calculation_transcripts_annotation` method is a string that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field. Defaults to None.
9950 def calculation_transcripts_prioritization(self) -> None: 9951 """ 9952 The function `calculation_transcripts_prioritization` creates a transcripts table and 9953 prioritizes transcripts based on certain criteria. 9954 """ 9955 9956 # Create transcripts table 9957 transcripts_table = self.create_transcript_view() 9958 9959 # Add info field 9960 if transcripts_table: 9961 self.transcripts_prioritization(transcripts_table=transcripts_table) 9962 else: 9963 log.info("No Transcripts to process. Check param.json file configuration")
The function calculation_transcripts_prioritization creates a transcripts table and
prioritizes transcripts based on certain criteria.
9965 def calculation_transcripts_export(self) -> None: 9966 """ """ 9967 9968 # Create transcripts table 9969 transcripts_table = self.create_transcript_view() 9970 9971 # Add info field 9972 if transcripts_table: 9973 self.transcripts_export(transcripts_table=transcripts_table) 9974 else: 9975 log.info("No Transcripts to process. Check param.json file configuration")
9981 def transcripts_export( 9982 self, transcripts_table: str = None, param: dict = {} 9983 ) -> bool: 9984 """ """ 9985 9986 log.debug("Start transcripts export...") 9987 9988 # Param 9989 if not param: 9990 param = self.get_param() 9991 9992 # Param export 9993 param_transcript_export = param.get("transcripts", {}).get("export", {}) 9994 9995 # Output file 9996 transcripts_export_output = param_transcript_export.get("output", None) 9997 9998 if not param_transcript_export or not transcripts_export_output: 9999 log.warning(f"No transcriipts export parameters defined!") 10000 return False 10001 10002 # List of transcripts annotations 10003 query_describe = f""" 10004 SELECT column_name 10005 FROM ( 10006 DESCRIBE SELECT * FROM {transcripts_table} 10007 ) 10008 WHERE column_name NOT IN ('#CHROM', 'POS', 'REF', 'ALT', 'INFO') 10009 """ 10010 transcripts_annotations_list = list( 10011 self.get_query_to_df(query=query_describe)["column_name"] 10012 ) 10013 10014 # Create transcripts table for export 10015 transcripts_table_export = f"{transcripts_table}_export_" + "".join( 10016 random.choices(string.ascii_uppercase + string.digits, k=10) 10017 ) 10018 query_create_transcripts_table_export = f""" 10019 CREATE TABLE {transcripts_table_export} AS (SELECT "#CHROM", "POS", "REF", "ALT", '' AS 'INFO', {', '.join(transcripts_annotations_list)} FROM {transcripts_table}) 10020 """ 10021 self.execute_query(query=query_create_transcripts_table_export) 10022 10023 # Output file format 10024 transcripts_export_output_format = get_file_format( 10025 filename=transcripts_export_output 10026 ) 10027 10028 # Format VCF - construct INFO 10029 if transcripts_export_output_format in ["vcf"]: 10030 10031 # Construct query update INFO and header 10032 query_update_info = [] 10033 for field in transcripts_annotations_list: 10034 10035 # If field not in header 10036 if field not in self.get_header_infos_list(): 10037 10038 # Add PZ Transcript in header 10039 self.get_header().infos[field] = 
vcf.parser._Info( 10040 field, 10041 ".", 10042 "String", 10043 f"Annotation '{field}' from transcript view", 10044 "unknown", 10045 "unknown", 10046 0, 10047 ) 10048 10049 # Add field as INFO/tag 10050 query_update_info.append( 10051 f""" 10052 CASE 10053 WHEN "{field}" IS NOT NULL 10054 THEN concat('{field}=', "{field}", ';') 10055 ELSE '' 10056 END 10057 """ 10058 ) 10059 10060 # Query param 10061 query_update_info_value = ( 10062 f""" concat('', {", ".join(query_update_info)}) """ 10063 ) 10064 query_export_columns = f""" "#CHROM", "POS", '.' AS 'ID', "REF", "ALT", '.' AS 'QUAL', '.' AS 'FILTER', "INFO" """ 10065 10066 else: 10067 10068 # Query param 10069 query_update_info_value = f""" NULL """ 10070 query_export_columns = f""" "#CHROM", "POS", "REF", "ALT", {', '.join(transcripts_annotations_list)} """ 10071 10072 # Update query INFO column 10073 query_update = f""" 10074 UPDATE {transcripts_table_export} 10075 SET INFO = {query_update_info_value} 10076 10077 """ 10078 self.execute_query(query=query_update) 10079 10080 # Export 10081 self.export_output( 10082 output_file=transcripts_export_output, 10083 query=f""" SELECT {query_export_columns} FROM {transcripts_table_export} """, 10084 ) 10085 10086 # Drop transcripts export table 10087 query_drop_transcripts_table_export = f""" 10088 DROP TABLE {transcripts_table_export} 10089 """ 10090 self.execute_query(query=query_drop_transcripts_table_export)
    def transcripts_prioritization(
        self, transcripts_table: str = None, param: dict = {}
    ) -> bool:
        """
        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
        and updates the variants table with the prioritized information.

        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
            of the table containing transcripts data. If no value is provided, it defaults to
            "transcripts" (created through `create_transcript_view`). This parameter is used to
            identify the table where the transcripts data is stored for the prioritization process
        :type transcripts_table: str
        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
            that contains various configuration settings for the prioritization process of transcripts
            (prefix for prioritization fields, default profiles, transcript preference file, ordering)
        :type param: dict
        :raises ValueError: if no transcripts table is available, if a field to explode is missing
            from the header/table, or if the transcript preference file does not exist
        :return: True if the transcripts prioritization process is successfully completed, False if
            prioritization was not processed or no profile is defined
        """

        log.debug("Start transcripts prioritization...")

        # Param
        if not param:
            param = self.get_param()

        # Variants table
        table_variants = self.get_table_variants()

        # Transcripts table (create the view when not provided)
        if transcripts_table is None:
            transcripts_table = self.create_transcript_view(
                transcripts_table="transcripts", param=param
            )
        if transcripts_table is None:
            # NOTE(review): typo "availalble" in the message — runtime string kept as-is here
            msg_err = "No Transcripts table availalble"
            log.error(msg_err)
            raise ValueError(msg_err)
        log.debug(f"transcripts_table={transcripts_table}")

        # Get transcripts columns
        columns_as_list_query = f"""
            DESCRIBE {transcripts_table}
        """
        columns_as_list = list(
            self.get_query_to_df(columns_as_list_query)["column_name"]
        )

        # Create INFO if not exists (prioritization writes into INFO)
        if "INFO" not in columns_as_list:
            query_add_info = f"""
                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
            """
            self.execute_query(query_add_info)

        # Prioritization param and Force only PZ Score and Flag
        pz_param = param.get("transcripts", {}).get("prioritization", {})

        # PZ profile by default
        pz_profile_default = (
            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
        )

        # Exit if no profile
        if pz_profile_default is None:
            log.warning("No profile defined for transcripts prioritization")
            return False

        # PZ fields: maps source field name -> INFO tag name to publish
        pz_param_pzfields = {}

        # PZ field holding the selected transcript (e.g. "PZTTranscript")
        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"

        # Add PZ Transcript in header
        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
            pz_fields_transcripts,
            ".",
            "String",
            f"Transcript selected from prioritization process, profile {pz_profile_default}",
            "unknown",
            "unknown",
            code_type_map["String"],
        )

        # Mandatory fields always produced by prioritization
        pz_mandatory_fields_list = [
            "Score",
            "Flag",
            "Tags",
            "Comment",
            "Infos",
            "Class",
        ]
        pz_mandatory_fields = []
        for pz_mandatory_field in pz_mandatory_fields_list:
            pz_mandatory_fields.append(
                pz_param.get("pzprefix", "PTZ") + pz_mandatory_field
            )

        # PZ fields in param: mandatory fields map to themselves (prefixed);
        # extra fields map from their raw name to a prefixed tag
        for pz_field in pz_param.get("pzfields", []):
            if pz_field in pz_mandatory_fields_list:
                pz_param_pzfields[pz_param.get("pzprefix", "PTZ") + pz_field] = (
                    pz_param.get("pzprefix", "PTZ") + pz_field
                )
            else:
                pz_field_new = pz_param.get("pzprefix", "PTZ") + pz_field
                pz_param_pzfields[pz_field] = pz_field_new

                # Add the prefixed annotation field in header
                self.get_header().infos[pz_field_new] = vcf.parser._Info(
                    pz_field_new,
                    ".",
                    "String",
                    f"Annotation '{pz_field}' from transcript selected from prioritization process, profile {pz_profile_default}",
                    "unknown",
                    "unknown",
                    code_type_map["String"],
                )

        # PZ fields param: prioritization itself only produces the mandatory fields
        pz_param["pzfields"] = pz_mandatory_fields

        # Prioritization on the transcripts table
        prioritization_result = self.prioritization(
            table=transcripts_table,
            pz_param=param.get("transcripts", {}).get("prioritization", {}),
        )
        if not prioritization_result:
            log.warning("Transcripts prioritization not processed")
            return False

        # PZ fields sql query pieces
        query_update_select_list = []
        query_update_concat_list = []
        query_update_order_list = []
        # Each selected field keeps a trailing comma — the ranking query appends
        # ROW_NUMBER() right after the joined list
        for pz_param_pzfield in set(
            list(pz_param_pzfields.keys()) + pz_mandatory_fields
        ):
            query_update_select_list.append(f" {pz_param_pzfield}, ")

        # One ';tag=value' CASE fragment per published field
        for pz_param_pzfield in pz_param_pzfields:
            query_update_concat_list.append(
                f"""
                , CASE
                    WHEN {pz_param_pzfield} IS NOT NULL
                    THEN concat(';{pz_param_pzfields.get(pz_param_pzfield)}=', {pz_param_pzfield})
                    ELSE ''
                END
                """
            )

        # Order by (defaults to Flag ASC then Score DESC)
        pz_orders = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_order", {})
        )
        if not pz_orders:
            pz_orders = {
                pz_param.get("pzprefix", "PTZ") + "Flag": "ASC",
                pz_param.get("pzprefix", "PTZ") + "Score": "DESC",
            }
        for pz_order in pz_orders:
            query_update_order_list.append(
                f""" {pz_order} {pz_orders.get(pz_order, "DESC")} """
            )

        # Fields to explode (INFO fields needed as real columns for the SQL)
        fields_to_explode = (
            list(pz_param_pzfields.keys())
            + pz_mandatory_fields
            + list(pz_orders.keys())
        )
        # Remove transcript column as a specific transcript column
        if "transcript" in fields_to_explode:
            fields_to_explode.remove("transcript")

        # Fields in transcripts table
        query_transcripts_table = f"""
            DESCRIBE SELECT * FROM {transcripts_table}
        """
        query_transcripts_table = self.get_query_to_df(query=query_transcripts_table)

        # Check fields to explode: must exist in header or already in the table
        for field_to_explode in fields_to_explode:
            if field_to_explode not in self.get_header_infos_list() + list(
                query_transcripts_table.column_name
            ):
                msg_err = f"INFO/{field_to_explode} NOT IN header"
                log.error(msg_err)
                raise ValueError(msg_err)

        # Explode fields to explode
        self.explode_infos(
            table=transcripts_table,
            fields=fields_to_explode,
        )

        # Transcript preference file
        transcripts_preference_file = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts", {})
        )
        transcripts_preference_file = full_path(transcripts_preference_file)

        # Transcript preference forced
        transcript_preference_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_force", False)
        )
        # Transcript version forced
        transcript_version_force = (
            param.get("transcripts", {})
            .get("prioritization", {})
            .get("prioritization_transcripts_version_force", False)
        )

        # Transcripts Ranking
        if transcripts_preference_file:

            # Transcripts file to dataframe
            if os.path.exists(transcripts_preference_file):
                transcripts_preference_dataframe = transcripts_file_to_df(
                    transcripts_preference_file
                )
            else:
                log.error(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )
                raise ValueError(
                    f"Transcript file '{transcripts_preference_file}' does NOT exist"
                )

            # Order by depending to transcript preference forcing
            if transcript_preference_force:
                order_by = f""" transcripts_preference.transcripts_preference_order ASC, {" , ".join(query_update_order_list)} """
            else:
                order_by = f""" {" , ".join(query_update_order_list)}, transcripts_preference.transcripts_preference_order ASC """

            # Transcript columns joined depend on version consideration
            # (split_part drops the ".version" suffix when versions are ignored)
            if transcript_version_force:
                transcripts_version_join = f""" {transcripts_table}.transcript = transcripts_preference.transcripts_preference """
            else:
                transcripts_version_join = f""" split_part({transcripts_table}.transcript, '.', 1) = split_part(transcripts_preference.transcripts_preference, '.', 1) """

            # Query ranking for update
            # (duckdb resolves 'transcripts_preference_dataframe' by local variable name)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, {transcripts_table}.transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {order_by}
                    ) AS rn
                FROM {transcripts_table}
                LEFT JOIN
                    (
                        SELECT transcript AS 'transcripts_preference', row_number() OVER () AS transcripts_preference_order
                        FROM transcripts_preference_dataframe
                    ) AS transcripts_preference
                ON {transcripts_version_join}
            """

        else:

            # Query ranking for update (no preference file: order by PZ fields only)
            query_update_ranking = f"""
                SELECT
                    "#CHROM", POS, REF, ALT, transcript, {" ".join(query_update_select_list)}
                    ROW_NUMBER() OVER (
                        PARTITION BY "#CHROM", POS, REF, ALT
                        ORDER BY {" , ".join(query_update_order_list)}
                    ) AS rn
                FROM {transcripts_table}
            """

        # Export Transcripts prioritization infos to variants table
        # NOTE(review): WHERE clause uses the literal alias 'variants' — assumes
        # table_variants is named "variants"; confirm against get_table_variants()
        query_update = f"""
            WITH RankedTranscripts AS (
                {query_update_ranking}
            )
            UPDATE {table_variants}
            SET
                INFO = CONCAT(CASE
                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                        THEN ''
                        ELSE concat("INFO", ';')
                    END,
                    concat('{pz_fields_transcripts}=', transcript {" ".join(query_update_concat_list)})
                )
            FROM
                RankedTranscripts
            WHERE
                rn = 1
                AND variants."#CHROM" = RankedTranscripts."#CHROM"
                AND variants."POS" = RankedTranscripts."POS"
                AND variants."REF" = RankedTranscripts."REF"
                AND variants."ALT" = RankedTranscripts."ALT"
        """

        # log.debug(f"query_update={query_update}")
        self.execute_query(query=query_update)

        # Return
        return True
The transcripts_prioritization function prioritizes transcripts based on certain parameters
and updates the variants table with the prioritized information.
Parameters
- transcripts_table: The `transcripts_table` parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process.
- param: The `param` parameter in the `transcripts_prioritization` method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields and the default profiles.
Returns
The function `transcripts_prioritization` returns a boolean value: `True` if the transcripts prioritization process is successfully completed, and `False` if there are any issues or if no profile is defined for transcripts prioritization.
10403 def create_transcript_view_from_columns_map( 10404 self, 10405 transcripts_table: str = "transcripts", 10406 columns_maps: dict = {}, 10407 added_columns: list = [], 10408 temporary_tables: list = None, 10409 annotation_fields: list = None, 10410 column_rename: dict = {}, 10411 column_clean: bool = False, 10412 column_case: str = None, 10413 ) -> tuple[list, list, list]: 10414 """ 10415 The `create_transcript_view_from_columns_map` function generates a temporary table view based on 10416 specified columns mapping for transcripts data. 10417 10418 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10419 of the table where the transcripts data is stored or will be stored in the database. This table 10420 typically contains information about transcripts such as Ensembl transcript IDs, gene names, 10421 scores, predictions, etc. It defaults to "transcripts, defaults to transcripts 10422 :type transcripts_table: str (optional) 10423 :param columns_maps: The `columns_maps` parameter is a dictionary that contains information 10424 about how to map columns from a transcripts table to create a view. Each entry in the 10425 `columns_maps` list represents a mapping configuration for a specific set of columns. It 10426 typically includes details such as the main transcript column and additional information columns 10427 :type columns_maps: dict 10428 :param added_columns: The `added_columns` parameter in the 10429 `create_transcript_view_from_columns_map` function is a list that stores the additional columns 10430 that will be added to the view being created based on the columns map provided. 
These columns 10431 are generated by exploding the transcript information columns along with the main transcript 10432 column 10433 :type added_columns: list 10434 :param temporary_tables: The `temporary_tables` parameter in the 10435 `create_transcript_view_from_columns_map` function is a list that stores the names of temporary 10436 tables created during the process of creating a transcript view from a columns map. These 10437 temporary tables are used to store intermediate results or transformations before the final view 10438 is generated 10439 :type temporary_tables: list 10440 :param annotation_fields: The `annotation_fields` parameter in the 10441 `create_transcript_view_from_columns_map` function is a list that stores the fields that are 10442 used for annotation in the query view creation process. These fields are extracted from the 10443 `transcripts_column` and `transcripts_infos_columns` specified in the `columns 10444 :type annotation_fields: list 10445 :param column_rename: The `column_rename` parameter in the 10446 `create_transcript_view_from_columns_map` function is a dictionary that allows you to specify 10447 custom renaming for columns during the creation of the temporary table view. This parameter 10448 provides a mapping of original column names to the desired renamed column names. By using this 10449 parameter, 10450 :type column_rename: dict 10451 :param column_clean: The `column_clean` parameter in the 10452 `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the 10453 column values should be cleaned or not. If set to `True`, the column values will be cleaned by 10454 removing any non-alphanumeric characters from them. 
This cleaning process ensures, defaults to 10455 False 10456 :type column_clean: bool (optional) 10457 :param column_case: The `column_case` parameter in the `create_transcript_view_from_columns_map` 10458 function is used to specify the case transformation to be applied to the columns during the view 10459 creation process. It allows you to control whether the column values should be converted to 10460 lowercase, uppercase, or remain unchanged 10461 :type column_case: str 10462 :return: The `create_transcript_view_from_columns_map` function returns a tuple containing three 10463 lists: `added_columns`, `temporary_tables`, and `annotation_fields`. 10464 """ 10465 10466 log.debug("Start transcrpts view creation from columns map...") 10467 10468 # "from_columns_map": [ 10469 # { 10470 # "transcripts_column": "Ensembl_transcriptid", 10471 # "transcripts_infos_columns": [ 10472 # "genename", 10473 # "Ensembl_geneid", 10474 # "LIST_S2_score", 10475 # "LIST_S2_pred", 10476 # ], 10477 # }, 10478 # { 10479 # "transcripts_column": "Ensembl_transcriptid", 10480 # "transcripts_infos_columns": [ 10481 # "genename", 10482 # "VARITY_R_score", 10483 # "Aloft_pred", 10484 # ], 10485 # }, 10486 # ], 10487 10488 # Init 10489 if temporary_tables is None: 10490 temporary_tables = [] 10491 if annotation_fields is None: 10492 annotation_fields = [] 10493 10494 # Variants table 10495 table_variants = self.get_table_variants() 10496 10497 for columns_map in columns_maps: 10498 10499 # Transcript column 10500 transcripts_column = columns_map.get("transcripts_column", None) 10501 10502 # Transcripts infos columns 10503 transcripts_infos_columns = columns_map.get("transcripts_infos_columns", []) 10504 10505 # Transcripts infos columns rename 10506 column_rename = columns_map.get("column_rename", column_rename) 10507 10508 # Transcripts infos columns clean 10509 column_clean = columns_map.get("column_clean", column_clean) 10510 10511 # Transcripts infos columns case 10512 column_case = 
columns_map.get("column_case", column_case) 10513 10514 if transcripts_column is not None: 10515 10516 # Explode 10517 added_columns += self.explode_infos( 10518 fields=[transcripts_column] + transcripts_infos_columns 10519 ) 10520 10521 # View clauses 10522 clause_select_variants = [] 10523 clause_select_tanscripts = [] 10524 for field in [transcripts_column] + transcripts_infos_columns: 10525 10526 # AS field 10527 as_field = field 10528 10529 # Rename 10530 if column_rename: 10531 as_field = column_rename.get(as_field, as_field) 10532 10533 # Clean 10534 if column_clean: 10535 as_field = clean_annotation_field(as_field) 10536 10537 # Case 10538 if column_case: 10539 if column_case.lower() in ["lower"]: 10540 as_field = as_field.lower() 10541 elif column_case.lower() in ["upper"]: 10542 as_field = as_field.upper() 10543 10544 # Clause select Variants 10545 clause_select_variants.append( 10546 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10547 ) 10548 10549 if field in [transcripts_column]: 10550 clause_select_tanscripts.append( 10551 f""" regexp_split_to_table("{field}", ',') AS '{field}' """ 10552 ) 10553 else: 10554 clause_select_tanscripts.append( 10555 f""" regexp_split_to_table("{field}", ',') AS '{as_field}' """ 10556 ) 10557 annotation_fields.append(as_field) 10558 10559 # Querey View 10560 query = f""" 10561 SELECT 10562 "#CHROM", POS, REF, ALT, INFO, 10563 "{transcripts_column}" AS 'transcript', 10564 {", ".join(clause_select_tanscripts)} 10565 FROM ( 10566 SELECT 10567 "#CHROM", POS, REF, ALT, INFO, 10568 {", ".join(clause_select_variants)} 10569 FROM {table_variants} 10570 ) 10571 WHERE "{transcripts_column}" IS NOT NULL 10572 """ 10573 10574 # Create temporary table 10575 temporary_table = transcripts_table + "".join( 10576 random.choices(string.ascii_uppercase + string.digits, k=10) 10577 ) 10578 10579 # Temporary_tables 10580 temporary_tables.append(temporary_table) 10581 query_view = f""" 10582 CREATE TEMPORARY TABLE 
{temporary_table} 10583 AS ({query}) 10584 """ 10585 self.execute_query(query=query_view) 10586 10587 return added_columns, temporary_tables, annotation_fields
The create_transcript_view_from_columns_map function generates a temporary table view based on
specified columns mapping for transcripts data.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts, defaults to transcripts - columns_maps: The
columns_mapsparameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in thecolumns_mapslist represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns - added_columns: The
added_columnsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_columns_mapfunction is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from thetranscripts_columnandtranscripts_infos_columnsspecified in the `columns - column_rename: The
column_renameparameter in thecreate_transcript_view_from_columns_mapfunction is a dictionary that allows you to specify custom renaming for columns during the creation of the temporary table view. This parameter provides a mapping of original column names to the desired renamed column names. By using this parameter, - column_clean: The
`column_clean` parameter in the `create_transcript_view_from_columns_map` function is a boolean flag that determines whether the column values should be cleaned or not. If set to `True`, the column values will be cleaned by removing any non-alphanumeric characters from them. It defaults to `False`. - column_case: The
column_caseparameter in thecreate_transcript_view_from_columns_mapfunction is used to specify the case transformation to be applied to the columns during the view creation process. It allows you to control whether the column values should be converted to lowercase, uppercase, or remain unchanged
Returns
The
`create_transcript_view_from_columns_map` function returns a tuple containing three lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
10589 def create_transcript_view_from_column_format( 10590 self, 10591 transcripts_table: str = "transcripts", 10592 column_formats: dict = {}, 10593 temporary_tables: list = None, 10594 annotation_fields: list = None, 10595 column_rename: dict = {}, 10596 column_clean: bool = False, 10597 column_case: str = None, 10598 ) -> tuple[list, list, list]: 10599 """ 10600 The `create_transcript_view_from_column_format` function generates a transcript view based on 10601 specified column formats, adds additional columns and annotation fields, and returns the list of 10602 temporary tables and annotation fields. 10603 10604 :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name 10605 of the table containing the transcripts data. This table will be used as the base table for 10606 creating the transcript view. The default value for this parameter is "transcripts", but you can 10607 provide a different table name if needed, defaults to transcripts 10608 :type transcripts_table: str (optional) 10609 :param column_formats: The `column_formats` parameter is a dictionary that contains information 10610 about the columns to be used for creating the transcript view. Each entry in the dictionary 10611 specifies the mapping between a transcripts column and a transcripts infos column. This 10612 parameter allows you to define how the columns from the transcripts table should be transformed 10613 or mapped 10614 :type column_formats: dict 10615 :param temporary_tables: The `temporary_tables` parameter in the 10616 `create_transcript_view_from_column_format` function is a list that stores the names of 10617 temporary views created during the process of creating a transcript view from a column format. 
10618 These temporary views are used to manipulate and extract data before generating the final 10619 transcript view 10620 :type temporary_tables: list 10621 :param annotation_fields: The `annotation_fields` parameter in the 10622 `create_transcript_view_from_column_format` function is a list that stores the annotation fields 10623 that are extracted from the temporary views created during the process. These annotation fields 10624 are obtained by querying the temporary views and extracting the column names excluding specific 10625 columns like `#CH 10626 :type annotation_fields: list 10627 :param column_rename: The `column_rename` parameter in the 10628 `create_transcript_view_from_column_format` function is a dictionary that allows you to specify 10629 custom renaming of columns in the transcripts infos table. By providing a mapping of original 10630 column names to new column names in this dictionary, you can rename specific columns during the 10631 process 10632 :type column_rename: dict 10633 :param column_clean: The `column_clean` parameter in the 10634 `create_transcript_view_from_column_format` function is a boolean flag that determines whether 10635 the transcripts infos columns should undergo a cleaning process. If set to `True`, the columns 10636 will be cleaned during the creation of the transcript view based on the specified column format, 10637 defaults to False 10638 :type column_clean: bool (optional) 10639 :param column_case: The `column_case` parameter in the 10640 `create_transcript_view_from_column_format` function is used to specify the case transformation 10641 to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" 10642 to convert the column names to uppercase or lowercase, respectively 10643 :type column_case: str 10644 :return: The `create_transcript_view_from_column_format` function returns two lists: 10645 `temporary_tables` and `annotation_fields`. 
10646 """ 10647 10648 log.debug("Start transcrpts view creation from column format...") 10649 10650 # "from_column_format": [ 10651 # { 10652 # "transcripts_column": "ANN", 10653 # "transcripts_infos_column": "Feature_ID", 10654 # } 10655 # ], 10656 10657 # Init 10658 if temporary_tables is None: 10659 temporary_tables = [] 10660 if annotation_fields is None: 10661 annotation_fields = [] 10662 10663 for column_format in column_formats: 10664 10665 # annotation field and transcript annotation field 10666 annotation_field = column_format.get("transcripts_column", "ANN") 10667 transcript_annotation = column_format.get( 10668 "transcripts_infos_column", "Feature_ID" 10669 ) 10670 10671 # Transcripts infos columns rename 10672 column_rename = column_format.get("column_rename", column_rename) 10673 10674 # Transcripts infos columns clean 10675 column_clean = column_format.get("column_clean", column_clean) 10676 10677 # Transcripts infos columns case 10678 column_case = column_format.get("column_case", column_case) 10679 10680 # Temporary View name 10681 temporary_view_name = transcripts_table + "".join( 10682 random.choices(string.ascii_uppercase + string.digits, k=10) 10683 ) 10684 10685 # Create temporary view name 10686 temporary_view_name = self.annotation_format_to_table( 10687 uniquify=True, 10688 annotation_field=annotation_field, 10689 view_name=temporary_view_name, 10690 annotation_id=transcript_annotation, 10691 column_rename=column_rename, 10692 column_clean=column_clean, 10693 column_case=column_case, 10694 ) 10695 10696 # Annotation fields 10697 if temporary_view_name: 10698 query_annotation_fields = f""" 10699 SELECT * 10700 FROM ( 10701 DESCRIBE SELECT * 10702 FROM {temporary_view_name} 10703 ) 10704 WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT') 10705 """ 10706 df_annotation_fields = self.get_query_to_df( 10707 query=query_annotation_fields 10708 ) 10709 10710 # Add temporary view and annotation fields 10711 
temporary_tables.append(temporary_view_name) 10712 annotation_fields += list(set(df_annotation_fields["column_name"])) 10713 10714 return temporary_tables, annotation_fields
The create_transcript_view_from_column_format function generates a transcript view based on
specified column formats, adds additional columns and annotation fields, and returns the list of
temporary tables and annotation fields.
Parameters
- transcripts_table: The
transcripts_tableparameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts - column_formats: The
column_formatsparameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. This parameter allows you to define how the columns from the transcripts table should be transformed or mapped - temporary_tables: The
temporary_tablesparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view - annotation_fields: The
annotation_fieldsparameter in thecreate_transcript_view_from_column_formatfunction is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names excluding specific columns like `#CH - column_rename: The
column_renameparameter in thecreate_transcript_view_from_column_formatfunction is a dictionary that allows you to specify custom renaming of columns in the transcripts infos table. By providing a mapping of original column names to new column names in this dictionary, you can rename specific columns during the process - column_clean: The
column_cleanparameter in thecreate_transcript_view_from_column_formatfunction is a boolean flag that determines whether the transcripts infos columns should undergo a cleaning process. If set toTrue, the columns will be cleaned during the creation of the transcript view based on the specified column format, defaults to False - column_case: The
column_caseparameter in thecreate_transcript_view_from_column_formatfunction is used to specify the case transformation to be applied to the columns in the transcript view. It can be set to either "upper" or "lower" to convert the column names to uppercase or lowercase, respectively
Returns
The
`create_transcript_view_from_column_format` function returns two lists: `temporary_tables` and `annotation_fields`.
    def create_transcript_view(
        self,
        transcripts_table: str = None,
        transcripts_table_drop: bool = True,
        param: dict = {},
    ) -> str:
        """
        Generate a transcripts table by exploding per-transcript annotations
        from the variants table and merging them per (variant, transcript).

        The "struct" section of the "transcripts" parameters drives two
        extraction strategies ("from_columns_map" and "from_column_format"),
        whose temporary tables are merged with UNION BY NAME, aggregated per
        transcript, optionally remapped through a transcript-ID mapping file,
        and finally materialized as `transcripts_table`.

        :param transcripts_table: Name of the table to create; if None, taken
            from param "transcripts.table" (default "transcripts")
        :type transcripts_table: str (optional)
        :param transcripts_table_drop: Drop an existing transcripts table
            before creating the new one, defaults to True
        :type transcripts_table_drop: bool (optional)
        :param param: Parameters dict; if empty, `self.get_param()` is used.
            Reads "transcripts.struct", "transcripts.transcript_id_remove_version",
            "transcripts.transcript_id_mapping_file" and
            "transcripts.transcript_id_mapping_force"
        :type param: dict
        :return: The name of the created transcripts table, or None if no
            "struct" configuration is present
        """

        log.debug("Start transcripts view creation...")

        # Default table name when none is configured
        transcripts_table_default = "transcripts"

        # Param (fall back to object-level parameters)
        if not param:
            param = self.get_param()

        # Struct: extraction configuration; nothing is built without it
        struct = param.get("transcripts", {}).get("struct", None)

        # Whether to strip the ".N" version suffix from transcript IDs
        transcript_id_remove_version = param.get("transcripts", {}).get(
            "transcript_id_remove_version", False
        )

        # Optional transcript-ID mapping file (transcript/alias pairs)
        transcript_id_mapping_file = param.get("transcripts", {}).get(
            "transcript_id_mapping_file", None
        )

        # If truthy, keep only transcripts present in the mapping file
        transcript_id_mapping_force = param.get("transcripts", {}).get(
            "transcript_id_mapping_force", None
        )

        if struct:

            # Transcripts table name (param override or default)
            if transcripts_table is None:
                transcripts_table = param.get("transcripts", {}).get(
                    "table", transcripts_table_default
                )

            # Columns added to the variants table during extraction
            # (dropped again at the end)
            added_columns = []

            # Temporary tables produced by the extraction strategies
            temporary_tables = []

            # Annotation field names collected from the temporary tables
            annotation_fields = []

            # Strategy 1: explode columns listed in "from_columns_map"
            columns_maps = struct.get("from_columns_map", [])
            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_columns_map(
                    transcripts_table=transcripts_table,
                    columns_maps=columns_maps,
                    added_columns=added_columns,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            added_columns += added_columns_tmp
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # Strategy 2: parse annotation formats listed in "from_column_format"
            column_formats = struct.get("from_column_format", [])
            temporary_tables_tmp, annotation_fields_tmp = (
                self.create_transcript_view_from_column_format(
                    transcripts_table=transcripts_table,
                    column_formats=column_formats,
                    temporary_tables=temporary_tables,
                    annotation_fields=annotation_fields,
                )
            )
            temporary_tables += temporary_tables_tmp
            annotation_fields += annotation_fields_tmp

            # De-duplicate and drop core variant columns from annotation fields
            annotation_fields = list(set(annotation_fields))
            for field in ["#CHROM", "POS", "REF", "ALT", "INFO", "transcript"]:
                if field in annotation_fields:
                    annotation_fields.remove(field)

            # Merge all temporary tables into one query (UNION BY NAME aligns
            # columns by name across tables with different schemas)
            query_merge = ""
            for temporary_table in list(set(temporary_tables)):

                # First temporary table
                if not query_merge:
                    query_merge = f"""
                        SELECT * FROM {temporary_table}
                    """
                # Other temporary tables (using UNION)
                else:
                    query_merge += f"""
                        UNION BY NAME SELECT * FROM {temporary_table}
                    """

            # Aliases for the nested subqueries below
            transcript_table_tmp = "transcripts_tmp"
            transcript_table_tmp2 = "transcripts_tmp2"
            transcript_table_tmp3 = "transcripts_tmp3"

            # SELECT clauses aggregating values per transcript
            query_merge_on_transcripts_annotation_fields = []

            # Add the comma-joined list of distinct transcript IDs
            query_merge_on_transcripts_annotation_fields.append(
                f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.transcript)), 'string_agg', ',') AS transcript_list """
            )

            # Aggregate all annotation fields the same way
            for annotation_field in set(annotation_fields):
                query_merge_on_transcripts_annotation_fields.append(
                    f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp}.{annotation_field})), 'string_agg', ',') AS {annotation_field} """
                )

            # Branch 1: transcript-ID mapping file provided
            if transcript_id_mapping_file:

                # Mapping dataframe; the variable name must match the table
                # name used in the SQL below — presumably DuckDB resolves the
                # local dataframe by name (replacement scan); TODO confirm
                transcript_id_mapping_dataframe_name = "transcript_id_mapping_dataframe"
                transcript_id_mapping_dataframe = transcripts_file_to_df(
                    transcript_id_mapping_file, column_names=["transcript", "alias"]
                )

                # Transcript version removal variants of the join clauses
                if transcript_id_remove_version:
                    query_transcript_column_select = f"split_part({transcript_table_tmp}.transcript, '.', 1) AS transcript_original, split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1) AS transcript_mapped"
                    query_transcript_column_group_by = f"split_part({transcript_table_tmp}.transcript, '.', 1), split_part({transcript_id_mapping_dataframe_name}.transcript, '.', 1)"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """
                else:
                    query_transcript_column_select = f"{transcript_table_tmp}.transcript AS transcript_original, {transcript_id_mapping_dataframe_name}.transcript AS transcript_mapped"
                    query_transcript_column_group_by = f"{transcript_table_tmp}.transcript, {transcript_id_mapping_dataframe_name}.transcript"
                    query_left_join = f"""
                        LEFT JOIN {transcript_id_mapping_dataframe_name} ON (split_part({transcript_id_mapping_dataframe_name}.alias, '.', 1)=split_part({transcript_table_tmp}.transcript, '.', 1))
                    """

                # Group-by key for the final merge: mapped ID when available,
                # otherwise the original ID (version stripped in both cases)
                query_transcript_merge_group_by = """
                    CASE
                        WHEN transcript_mapped NOT IN ('')
                        THEN split_part(transcript_mapped, '.', 1)
                        ELSE split_part(transcript_original, '.', 1)
                    END
                """

                # Merge query: aggregate annotations per variant/transcript,
                # joined against the mapping dataframe
                transcripts_tmp2_query = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_select}, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    {query_left_join}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column_group_by}
                """

                # Retrieve column names after merge
                transcripts_tmp2_describe_query = f"""
                    DESCRIBE {transcripts_tmp2_query}
                """
                transcripts_tmp2_describe_list = list(
                    self.get_query_to_df(query=transcripts_tmp2_describe_query)[
                        "column_name"
                    ]
                )

                # Create list of columns for the outer SELECT clause
                transcripts_tmp2_describe_select_clause = []
                for field in transcripts_tmp2_describe_list:
                    if field not in [
                        "#CHROM",
                        "POS",
                        "REF",
                        "ALT",
                        "INFO",
                        "transcript_mapped",
                    ]:
                        as_field = field
                        # NOTE(review): 'transcript_original' is re-exposed
                        # under the alias 'transcripts_mapped' — confirm this
                        # alias (vs 'transcript_original') is intended
                        if field in ["transcript_original"]:
                            as_field = "transcripts_mapped"
                        transcripts_tmp2_describe_select_clause.append(
                            f""" list_aggregate(list_distinct(array_agg({transcript_table_tmp2}.{field})), 'string_agg', ',') AS {as_field} """
                        )

                # Merge with mapping: collapse rows onto the mapped transcript
                query_merge_on_transcripts = f"""
                    SELECT
                        "#CHROM", POS, REF, ALT, INFO,
                        CASE
                            WHEN ANY_VALUE(transcript_mapped) NOT IN ('')
                            THEN ANY_VALUE(transcript_mapped)
                            ELSE ANY_VALUE(transcript_original)
                        END AS transcript,
                        {", ".join(transcripts_tmp2_describe_select_clause)}
                    FROM ({transcripts_tmp2_query}) AS {transcript_table_tmp2}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO,
                        {query_transcript_merge_group_by}
                """

                # Optionally keep only transcripts present in the mapping file
                if transcript_id_mapping_force:
                    query_merge_on_transcripts = f"""
                        SELECT *
                        FROM ({query_merge_on_transcripts}) AS {transcript_table_tmp3}
                        WHERE split_part({transcript_table_tmp3}.transcript, '.', 1) in (SELECT split_part(transcript, '.', 1) FROM transcript_id_mapping_dataframe)
                    """

            # Branch 2: no transcript mapping
            else:

                # Transcript key with or without version suffix
                if transcript_id_remove_version:
                    query_transcript_column = f"""
                        split_part({transcript_table_tmp}.transcript, '.', 1)
                    """
                else:
                    query_transcript_column = """
                        transcript
                    """

                # Query sections
                # NOTE(review): these two variables are not used below in this
                # branch (the query inlines query_transcript_column directly)
                query_transcript_column_select = (
                    f"{query_transcript_column} AS transcript"
                )
                query_transcript_column_group_by = query_transcript_column

                # Query for transcripts view (transcript_mapped kept as NULL
                # so both branches produce the same schema)
                query_merge_on_transcripts = f"""
                    SELECT "#CHROM", POS, REF, ALT, INFO, {query_transcript_column} AS transcript, NULL AS transcript_mapped, {", ".join(query_merge_on_transcripts_annotation_fields)}
                    FROM ({query_merge}) AS {transcript_table_tmp}
                    GROUP BY "#CHROM", POS, REF, ALT, INFO, {query_transcript_column}
                """

            log.debug(f"query_merge_on_transcripts={query_merge_on_transcripts}")

            # Drop transcripts table if necessary
            if transcripts_table_drop:
                query_drop = f"""
                    DROP TABLE IF EXISTS {transcripts_table};
                """
                self.execute_query(query=query_drop)

            # Merge and create transcripts table
            query_create_view = f"""
                CREATE TABLE IF NOT EXISTS {transcripts_table}
                AS {query_merge_on_transcripts}
            """
            self.execute_query(query=query_create_view)

            # Remove columns added to the variants table during extraction
            for added_column in added_columns:
                self.drop_column(column=added_column)

        else:

            # No "struct" configuration: nothing created
            transcripts_table = None

        return transcripts_table
The create_transcript_view function generates a transcript view by processing data from a
specified table based on provided parameters and structural information.
Parameters
- transcripts_table: The
`transcripts_table` parameter in the `create_transcript_view` function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data; it defaults to "transcripts". - transcripts_table_drop: The
transcripts_table_dropparameter in thecreate_transcript_viewfunction is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. Iftranscripts_table_dropis set toTrue, the function will drop the existing transcripts table if it exists, defaults to True - param: The
paramparameter in thecreate_transcript_viewfunction is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns
The
`create_transcript_view` function returns the name of the transcripts table that was created or modified during the execution of the function.
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
        column_rename: dict = {},
        column_clean: bool = False,
        column_case: str = None,
    ) -> str:
        """
        Convert a pipe-delimited annotation INFO field (e.g. snpEff "ANN")
        into a structured temporary table, one column per annotation sub-field
        plus a 'transcript' column taken from `annotation_id`.

        The annotation header is parsed from the field description in the VCF
        header; values are exploded to JSON per variant, then typed columns
        are built from the JSON keys.

        :param uniquify: Passed to `explode_annotation_format` to ensure
            unique values in the exploded output, defaults to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field holding the
            annotation to explode, defaults to "ANN"
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as the transcript
            identifier column, defaults to "Feature_ID"
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to
            "transcripts"
        :type view_name: str (optional)
        :param column_rename: Mapping of original to renamed column names;
            also applied to `annotation_id`
        :type column_rename: dict
        :param column_clean: If True, clean column names with
            `clean_annotation_field`, defaults to False
        :type column_clean: bool (optional)
        :param column_case: Case transformation for column names ("lower" or
            "upper"); other values leave names unchanged
        :type column_case: str
        :raises ValueError: If the annotation field description does not
            contain a quoted, pipe-separated header
        :return: The name of the created table, or None if `annotation_field`
            is not declared in the VCF header
        """

        # Name of the intermediate exploded-JSON column
        annotation_format = "annotation_explode"

        # Apply rename/clean to the transcript identifier column as well
        if column_rename:
            annotation_id = column_rename.get(annotation_id, annotation_id)

        if column_clean:
            annotation_id = clean_annotation_field(annotation_id)

        # Prefix
        # NOTE(review): any truthy configured prefix is replaced by the
        # constant "INFO/" — confirm this is intended rather than using the
        # configured prefix value itself
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the raw and exploded annotation data
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # VCF header (provides INFO field descriptions)
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped at the end)
        added_columns = []

        # Explode the annotation INFO field into a column
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the annotation header from the quoted part of the INFO
            # description, e.g. 'Allele | Annotation | ... | Feature_ID | ...'
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name (alphanumeric only) mapped back to the
                    # original header label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (added, thus dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load variants plus the raw annotation column into a dataframe
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value to JSON, keyed by the header
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys present in the exploded annotations
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Raw key and its cleaned/renamed output name
                key = row.iloc[0]
                key_clean = key

                # Key rename
                if column_rename:
                    key_clean = column_rename.get(key_clean, key_clean)

                # Key clean
                if column_clean:
                    key_clean = clean_annotation_field(key_clean)

                # Key case
                if column_case:
                    if column_case.lower() in ["lower"]:
                        key_clean = key_clean.lower()
                    elif column_case.lower() in ["upper"]:
                        key_clean = key_clean.upper()

                # Sample the key's values to infer the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Normalize missing values (None/"" -> NaN) and drop them so
                # type detection only sees real values
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type
                column_type = detect_column_type(df_json_type[key_clean])

                # Typed, NULLIF-guarded column expression
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type} AS '{prefix}{key_clean}' """
                )

            # Create the temporary table with one row per annotation entry
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                    )
                );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not declared in the header: nothing created
            view_name = None

        # Remove columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
The annotation_format_to_table function converts annotation data from a VCF file into a
structured table format, ensuring unique values and creating a temporary table for further
processing or analysis.
Parameters
- uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to `True`, the function will make sure that the output values are unique. Defaults to True.
- annotation_field: The `annotation_field` parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function. Defaults to "ANN".
- annotation_id: The `annotation_id` parameter is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry. Defaults to "Feature_ID".
- view_name: The `view_name` parameter is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis. Defaults to "transcripts".
- column_rename: The `column_rename` parameter is a dictionary that allows you to specify custom renaming for columns. By providing key-value pairs in this dictionary, you can rename specific columns in the resulting table or view that is created based on the annotation data.
- column_clean: The `column_clean` parameter is a boolean flag that determines whether the annotation field should undergo a cleaning process. If set to `True`, the function will clean the annotation field before further processing; this cleaning step may involve removing unwanted characters or formatting inconsistencies. Defaults to False.
- column_case: The `column_case` parameter is used to specify the case transformation to be applied to the column names extracted from the annotation data. It allows you to set the case of the column names to either lowercase or uppercase for consistency or other specific requirements during the conversion.
Returns
The function `annotation_format_to_table` returns the name of the view created, which is stored in the variable `view_name`.
11200 def transcript_view_to_variants( 11201 self, 11202 transcripts_table: str = None, 11203 transcripts_column_id: str = None, 11204 transcripts_info_json: str = None, 11205 transcripts_info_field_json: str = None, 11206 transcripts_info_format: str = None, 11207 transcripts_info_field_format: str = None, 11208 param: dict = {}, 11209 ) -> bool: 11210 """ 11211 The `transcript_view_to_variants` function updates a variants table with information from 11212 transcripts in JSON format. 11213 11214 :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the 11215 table containing the transcripts data. If this parameter is not provided, the function will 11216 attempt to retrieve it from the `param` dictionary or use a default value of "transcripts" 11217 :type transcripts_table: str 11218 :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the 11219 column in the `transcripts_table` that contains the unique identifier for each transcript. This 11220 identifier is used to match transcripts with variants in the database 11221 :type transcripts_column_id: str 11222 :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name 11223 of the column in the variants table where the transcripts information will be stored in JSON 11224 format. This parameter allows you to define the column in the variants table that will hold the 11225 JSON-formatted information about transcripts 11226 :type transcripts_info_json: str 11227 :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to 11228 specify the field in the VCF header that will contain information about transcripts in JSON 11229 format. 
This field will be added to the VCF header as an INFO field with the specified name 11230 :type transcripts_info_field_json: str 11231 :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the 11232 format of the information about transcripts that will be stored in the variants table. This 11233 format can be used to define how the transcript information will be structured or displayed 11234 within the variants table 11235 :type transcripts_info_format: str 11236 :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to 11237 specify the field in the VCF header that will contain information about transcripts in a 11238 specific format. This field will be added to the VCF header as an INFO field with the specified 11239 name 11240 :type transcripts_info_field_format: str 11241 :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary 11242 that contains various configuration settings related to transcripts. It is used to provide 11243 default values for certain parameters if they are not explicitly provided when calling the 11244 method. The `param` dictionary can be passed as an argument 11245 :type param: dict 11246 :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True` 11247 if the operation is successful and `False` if certain conditions are not met. 
11248 """ 11249 11250 msg_info_prefix = "Start transcripts view to variants annotations" 11251 11252 log.debug(f"{msg_info_prefix}...") 11253 11254 # Default 11255 transcripts_table_default = "transcripts" 11256 transcripts_column_id_default = "transcript" 11257 transcripts_info_json_default = None 11258 transcripts_info_format_default = None 11259 transcripts_info_field_json_default = None 11260 transcripts_info_field_format_default = None 11261 11262 # Param 11263 if not param: 11264 param = self.get_param() 11265 11266 # Transcripts table 11267 if transcripts_table is None: 11268 transcripts_table = param.get("transcripts", {}).get( 11269 "table", transcripts_table_default 11270 ) 11271 11272 # Transcripts column ID 11273 if transcripts_column_id is None: 11274 transcripts_column_id = param.get("transcripts", {}).get( 11275 "column_id", transcripts_column_id_default 11276 ) 11277 11278 # Transcripts info json 11279 if transcripts_info_json is None: 11280 transcripts_info_json = param.get("transcripts", {}).get( 11281 "transcripts_info_json", transcripts_info_json_default 11282 ) 11283 11284 # Transcripts info field JSON 11285 if transcripts_info_field_json is None: 11286 transcripts_info_field_json = param.get("transcripts", {}).get( 11287 "transcripts_info_field_json", transcripts_info_field_json_default 11288 ) 11289 # if transcripts_info_field_json is not None and transcripts_info_json is None: 11290 # transcripts_info_json = transcripts_info_field_json 11291 11292 # Transcripts info format 11293 if transcripts_info_format is None: 11294 transcripts_info_format = param.get("transcripts", {}).get( 11295 "transcripts_info_format", transcripts_info_format_default 11296 ) 11297 11298 # Transcripts info field FORMAT 11299 if transcripts_info_field_format is None: 11300 transcripts_info_field_format = param.get("transcripts", {}).get( 11301 "transcripts_info_field_format", transcripts_info_field_format_default 11302 ) 11303 # if ( 11304 # 
transcripts_info_field_format is not None 11305 # and transcripts_info_format is None 11306 # ): 11307 # transcripts_info_format = transcripts_info_field_format 11308 11309 # Variants table 11310 table_variants = self.get_table_variants() 11311 11312 # Check info columns param 11313 if ( 11314 transcripts_info_json is None 11315 and transcripts_info_field_json is None 11316 and transcripts_info_format is None 11317 and transcripts_info_field_format is None 11318 ): 11319 return False 11320 11321 # Transcripts infos columns 11322 query_transcripts_infos_columns = f""" 11323 SELECT * 11324 FROM ( 11325 DESCRIBE SELECT * FROM {transcripts_table} 11326 ) 11327 WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}') 11328 """ 11329 transcripts_infos_columns = list( 11330 self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"] 11331 ) 11332 11333 # View results 11334 clause_select = [] 11335 clause_to_json = [] 11336 clause_to_format = [] 11337 for field in transcripts_infos_columns: 11338 # Do not consider INFO field for export into fields 11339 if field not in ["INFO"]: 11340 clause_select.append( 11341 f""" regexp_split_to_table(CAST("{field}" AS STRING), ',') AS '{field}' """ 11342 ) 11343 clause_to_json.append(f""" '{field}': "{field}" """) 11344 clause_to_format.append(f""" "{field}" """) 11345 11346 # Update 11347 update_set_json = [] 11348 update_set_format = [] 11349 11350 # VCF header 11351 vcf_reader = self.get_header() 11352 11353 # Transcripts to info column in JSON 11354 if transcripts_info_json: 11355 11356 # Create column on variants table 11357 self.add_column( 11358 table_name=table_variants, 11359 column_name=transcripts_info_json, 11360 column_type="JSON", 11361 default_value=None, 11362 drop=False, 11363 ) 11364 11365 # Add header 11366 vcf_reader.infos[transcripts_info_json] = vcf.parser._Info( 11367 transcripts_info_json, 11368 ".", 11369 "String", 11370 "Transcripts in JSON format", 11371 "unknwon", 
11372 "unknwon", 11373 self.code_type_map["String"], 11374 ) 11375 11376 # Add to update 11377 update_set_json.append( 11378 f""" {transcripts_info_json}=t.{transcripts_info_json} """ 11379 ) 11380 11381 # Transcripts to info field in JSON 11382 if transcripts_info_field_json: 11383 11384 log.debug(f"{msg_info_prefix} - Annotation in JSON format...") 11385 11386 # Add to update 11387 update_set_json.append( 11388 f""" 11389 INFO = concat( 11390 CASE 11391 WHEN INFO NOT IN ('', '.') 11392 THEN INFO 11393 ELSE '' 11394 END, 11395 CASE 11396 WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.') 11397 THEN concat( 11398 ';{transcripts_info_field_json}=', 11399 t.{transcripts_info_json} 11400 ) 11401 ELSE '' 11402 END 11403 ) 11404 """ 11405 ) 11406 11407 # Add header 11408 vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info( 11409 transcripts_info_field_json, 11410 ".", 11411 "String", 11412 "Transcripts in JSON format", 11413 "unknwon", 11414 "unknwon", 11415 self.code_type_map["String"], 11416 ) 11417 11418 if update_set_json: 11419 11420 # Update query 11421 query_update = f""" 11422 UPDATE {table_variants} 11423 SET {", ".join(update_set_json)} 11424 FROM 11425 ( 11426 SELECT 11427 "#CHROM", POS, REF, ALT, 11428 concat( 11429 '{{', 11430 string_agg( 11431 '"' || "{transcripts_column_id}" || '":' || 11432 to_json(json_output) 11433 ), 11434 '}}' 11435 )::JSON AS {transcripts_info_json} 11436 FROM 11437 ( 11438 SELECT 11439 "#CHROM", POS, REF, ALT, 11440 "{transcripts_column_id}", 11441 to_json( 11442 {{{",".join(clause_to_json)}}} 11443 )::JSON AS json_output 11444 FROM 11445 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11446 WHERE "{transcripts_column_id}" IS NOT NULL 11447 ) 11448 GROUP BY "#CHROM", POS, REF, ALT 11449 ) AS t 11450 WHERE {table_variants}."#CHROM" = t."#CHROM" 11451 AND {table_variants}."POS" = t."POS" 11452 AND {table_variants}."REF" = t."REF" 11453 AND 
{table_variants}."ALT" = t."ALT" 11454 """ 11455 11456 self.execute_query(query=query_update) 11457 11458 # Transcripts to info column in FORMAT 11459 if transcripts_info_format: 11460 11461 # Create column on variants table 11462 self.add_column( 11463 table_name=table_variants, 11464 column_name=transcripts_info_format, 11465 column_type="VARCHAR", 11466 default_value=None, 11467 drop=False, 11468 ) 11469 11470 # Add header 11471 vcf_reader.infos[transcripts_info_format] = vcf.parser._Info( 11472 transcripts_info_format, 11473 ".", 11474 "String", 11475 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11476 "unknwon", 11477 "unknwon", 11478 self.code_type_map["String"], 11479 ) 11480 11481 # Add to update 11482 update_set_format.append( 11483 f""" {transcripts_info_format}=t.{transcripts_info_format} """ 11484 ) 11485 11486 else: 11487 11488 # Set variable for internal queries 11489 transcripts_info_format = "transcripts_info_format" 11490 11491 # Transcripts to info field in JSON 11492 if transcripts_info_field_format: 11493 11494 log.debug(f"{msg_info_prefix} - Annotation in structured format...") 11495 11496 # Add to update 11497 update_set_format.append( 11498 f""" 11499 INFO = concat( 11500 CASE 11501 WHEN INFO NOT IN ('', '.') 11502 THEN INFO 11503 ELSE '' 11504 END, 11505 CASE 11506 WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.') 11507 THEN concat( 11508 ';{transcripts_info_field_format}=', 11509 t.{transcripts_info_format} 11510 ) 11511 ELSE '' 11512 END 11513 ) 11514 """ 11515 ) 11516 11517 # Add header 11518 vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info( 11519 transcripts_info_field_format, 11520 ".", 11521 "String", 11522 f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'", 11523 "unknwon", 11524 "unknwon", 11525 self.code_type_map["String"], 11526 ) 11527 11528 if update_set_format: 11529 11530 # Update query 11531 query_update = f""" 11532 UPDATE 
{table_variants} 11533 SET {", ".join(update_set_format)} 11534 FROM 11535 ( 11536 SELECT 11537 "#CHROM", POS, REF, ALT, 11538 string_agg({transcripts_info_format}) AS {transcripts_info_format} 11539 FROM 11540 ( 11541 SELECT 11542 "#CHROM", POS, REF, ALT, 11543 "{transcripts_column_id}", 11544 concat( 11545 "{transcripts_column_id}", 11546 '|', 11547 {", '|', ".join(clause_to_format)} 11548 ) AS {transcripts_info_format} 11549 FROM 11550 (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table}) 11551 ) 11552 GROUP BY "#CHROM", POS, REF, ALT 11553 ) AS t 11554 WHERE {table_variants}."#CHROM" = t."#CHROM" 11555 AND {table_variants}."POS" = t."POS" 11556 AND {table_variants}."REF" = t."REF" 11557 AND {table_variants}."ALT" = t."ALT" 11558 """ 11559 11560 self.execute_query(query=query_update) 11561 11562 return True
The transcript_view_to_variants function updates a variants table with information from
transcripts in JSON format.
Parameters
- transcripts_table: The `transcripts_table` parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the `param` dictionary or use a default value of "transcripts".
- transcripts_column_id: The `transcripts_column_id` parameter is used to specify the column in the `transcripts_table` that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database.
- transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format.
- transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name.
- transcripts_info_format: The `transcripts_info_format` parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table.
- transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name.
- param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary that contains various configuration settings related to transcripts. It is used to provide default values for certain parameters if they are not explicitly provided when calling the method.
Returns
The function `transcript_view_to_variants` returns a boolean value: `True` if the operation is successful and `False` if certain conditions are not met.